From 6633a17f00e0122f62e9fb39e42515df0e295793 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Mon, 4 Nov 2024 18:46:19 -0800 Subject: [PATCH 01/32] Add Pulumi deploy script --- pulumi/Pulumi.yaml | 2 + pulumi/README.md | 57 ++++++++ pulumi/__main__.py | 197 ++++++++++++++++++++++++++ pulumi/post_deploy.sh | 29 ++++ pulumi/util.py | 49 +++++++ pulumi/vars.example.py | 16 +++ pulumi/zinit/zdb.yaml | 10 ++ pulumi/zinit/zdbfs.yaml | 2 + pulumi/zinit/zstor.yaml | 6 + pulumi/zstor_config.base.example.toml | 20 +++ 10 files changed, 388 insertions(+) create mode 100644 pulumi/Pulumi.yaml create mode 100644 pulumi/README.md create mode 100644 pulumi/__main__.py create mode 100644 pulumi/post_deploy.sh create mode 100644 pulumi/util.py create mode 100644 pulumi/vars.example.py create mode 100644 pulumi/zinit/zdb.yaml create mode 100644 pulumi/zinit/zdbfs.yaml create mode 100644 pulumi/zinit/zstor.yaml create mode 100644 pulumi/zstor_config.base.example.toml diff --git a/pulumi/Pulumi.yaml b/pulumi/Pulumi.yaml new file mode 100644 index 0000000..d2beeb7 --- /dev/null +++ b/pulumi/Pulumi.yaml @@ -0,0 +1,2 @@ +name: qsfs +runtime: python diff --git a/pulumi/README.md b/pulumi/README.md new file mode 100644 index 0000000..ec690d3 --- /dev/null +++ b/pulumi/README.md @@ -0,0 +1,57 @@ +# Deploy QSFS with Pulumi + +This is a Pulumi deployment script in Python that fully automates the setup of a QSFS instance. The following steps are required to use this script: + +1. Install Pulumi and Python on your system +2. Use Pip to install the Python dependencies +3. Copy and edit vars.py and zstor_config.base.toml + +Only Linux and MacOS are supported. If you run Windows, I'd recommend equipping yourself with a WSL environment. + +## Install Pulumi and Python + +We won't cover the details here. Probably your system already has `python3`. 
+ +For Pulumi, check here: https://www.pulumi.com/docs/iac/download-install/ + +## Install Python dependencies + +We need some Python packages to make this work. Using a venv is recommended. + +``` +python -m venv .venv +source .venv/bin/activate +pip install pulumi pulumi_random pulumi_threefold +``` + +## Prep config + +Two config files are needed. Examples are included here. Copy the examples to the expected paths, then edit the files according to your needs. + +``` +cp vars.example.py vars.py +cp zstor_config.base.example.toml zstor_config.base.toml + +$EDITOR vars.py +$EDITOR zstor_config.base.toml +``` + +## Deploy + +Prior to using Pulumi, you need to login. There are some options here, which you can read about, but the simplest thing is to just use `--local`: + +``` +pulumi login --local +``` + +Now we can bring up the deployment. Create a stack when prompted with your name of choice. + +``` +pulumi up +``` + +If you want to destroy the deployment, bring it down like this: + +``` +pulumi down +``` diff --git a/pulumi/__main__.py b/pulumi/__main__.py new file mode 100644 index 0000000..2d678f9 --- /dev/null +++ b/pulumi/__main__.py @@ -0,0 +1,197 @@ +import os +import secrets +import shutil +import textwrap +import pulumi +import pulumi_random +import pulumi_threefold as threefold + +import util + +# It's up to the user to create their own vars.py before trying to deploy +try: + import vars +except ModuleNotFoundError: + exit("vars.py not found. Exiting.") + +# Same for the base zstor config. Exit if the user didn't provide this +ZSTOR_CONFIG_BASE = "zstor_config.base.toml" +ZSTOR_CONFIG = "zstor_config.toml" + +if not os.path.exists(ZSTOR_CONFIG_BASE): + exit("zstor_config.base.toml not found. 
Exiting.") + + +# Path of the script that will run on the deployed VM after deployment +# Installs needed binaries and starts up all the services +POST_DEPLOY_SCRIPT = "post_deploy.sh" + +# If a node has IPv6, then it will be the first IP in the zdb IP list +# Mycelium will always be last, but this could be index 1 or 2 +ZDB_IP6_INDEX = 0 +ZDB_MYC_INDEX = -1 + +# From here are all the parameters for the deployment +MNEMONIC = vars.MNEMONIC +NETWORK = vars.NETWORK + +with open(os.path.expanduser("~/.ssh/id_rsa.pub")) as file: + SSH_KEY = file.read() + +VM_NODE = vars.VM_NODE +FLIST = "https://hub.grid.tf/tf-official-apps/threefoldtech-ubuntu-22.04.flist" +CPU = 1 +RAM = 2048 # MB +ROOTFS = 1024 * 15 # MB +NET_NAME = "net" + +META_NODES = vars.META_NODES +DATA_NODES = vars.DATA_NODES +DATA_SIZE = vars.DATA_SIZE +META_SIZE = 1 + +# Generate separate secrets for Zstor key and Zdb namespaces passwords +ZSTOR_KEY = secrets.token_hex(32) +ZDB_PW = secrets.token_urlsafe(32) +zstor_key = pulumi_random.RandomBytes("zstor_key", length=32) +zdb_pw = pulumi_random.RandomPassword("zdb_pw", length=20) + +if vars.ZDB_CONNECTION == "ipv6": + ZDB_INDEX = ZDB_IP6_INDEX +elif vars.ZDB_CONNECTION == "mycelium": + ZDB_INDEX = ZDB_MYC_INDEX + +provider = threefold.Provider("provider", mnemonic=MNEMONIC, network=NETWORK) + +network = threefold.Network( + "network", + name=NET_NAME, + description="A network", + nodes=[VM_NODE], + ip_range="10.1.0.0/16", + mycelium=True, + opts=pulumi.ResourceOptions(provider=provider), +) + +nodes = set([VM_NODE] + META_NODES + DATA_NODES) + +deployments = {} + +for node in nodes: + net_name = "" + vms = [] + depends = [] + if node == VM_NODE: + net_name = NET_NAME + depends.append(network) + vms.append( + threefold.VMInputArgs( + name="vm", + flist=FLIST, + entrypoint="/sbin/zinit init", + network_name=net_name, + cpu=CPU, + memory=RAM, + rootfs_size=ROOTFS, + mycelium=True, + planetary=True, + public_ip6=True, + env_vars={ + "SSH_KEY": SSH_KEY, + }, + ) 
+ ) + zdbs = [] + if node in DATA_NODES: + zdbs.append( + threefold.ZDBInputArgs( + name="data" + str(node), + size=DATA_SIZE, + mode="seq", + password=zdb_pw.result, + ) + ) + if node in META_NODES: + zdbs.append( + threefold.ZDBInputArgs( + name="meta" + str(node), + size=META_SIZE, + mode="user", + password=zdb_pw.result, + ) + ) + + deployments[node] = threefold.Deployment( + "deployment" + str(node), + node_id=node, + name="node" + str(node), + network_name=net_name, + vms=vms, + zdbs=zdbs, + opts=pulumi.ResourceOptions(provider=provider, depends_on=depends), + ) + + +def post_deploy(args): + # TODO: Don't overwrite existing file if it's there + # Actually, maybe it's okay as long as we have the secrets persisted + shutil.copy(ZSTOR_CONFIG_BASE, ZSTOR_CONFIG) + + meta_zdbs = [] + data_zdbs = [] + for vm_list, zdb_list in args["deployments"]: + if vm_list: + vm = vm_list[0] + + for zdb in zdb_list: + if "meta" in zdb["namespace"]: + meta_zdbs.append(zdb) + else: + data_zdbs.append(zdb) + meta_zdbs = sorted(meta_zdbs, key=lambda z: z["namespace"].split("-")[-1]) + data_zdbs = sorted(data_zdbs, key=lambda z: z["namespace"].split("-")[-1]) + + with open(ZSTOR_CONFIG, "a") as file: + encryption_config = f""" + [encryption] + algorithm = "AES" + key = "{args['zstor_key']}" + + [meta.config.encryption] + algorithm = "AES" + key = "{args['zstor_key']}" + """ + file.write(textwrap.dedent(encryption_config)) + for zdb in meta_zdbs: + ip = zdb["ips"][ZDB_INDEX] + ns = zdb["namespace"] + file.write("[[meta.config.backends]]\n") + file.write(f'address = "[{ip}]:9900"\n') + file.write(f'namespace = "{ns}"\n') + file.write(f'password = "{args['zdb_pw']}"\n\n') + + file.write("[[groups]]\n") + for zdb in data_zdbs: + ip = zdb["ips"][ZDB_INDEX] + ns = zdb["namespace"] + file.write("[[groups.backends]]\n") + file.write(f'address = "[{ip}]:9900"\n') + file.write(f'namespace = "{ns}"\n') + file.write(f'password = "{args['zdb_pw']}"\n\n') + + # ssh_ip = vm["mycelium_ip"] + ssh_ip 
= vm["computed_ip6"].split("/")[0] + util.scp(ssh_ip, "zinit/", "/etc/") + util.scp(ssh_ip, ZSTOR_CONFIG, f"/etc/{ZSTOR_CONFIG}") + util.run_script_ssh(ssh_ip, POST_DEPLOY_SCRIPT) + + +pulumi.Output.all( + deployments=[(d.vms_computed, d.zdbs_computed) for d in deployments.values()], + zstor_key=zstor_key.hex, + zdb_pw=zdb_pw.result, +).apply(post_deploy) + +vm = deployments[VM_NODE].vms_computed[0] +pulumi.export("mycelium_ip", vm.mycelium_ip) +pulumi.export("pub_ipv6", vm.computed_ip6) diff --git a/pulumi/post_deploy.sh b/pulumi/post_deploy.sh new file mode 100644 index 0000000..3000618 --- /dev/null +++ b/pulumi/post_deploy.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Primitive idempotency +zinit | grep -q zstor && exit + +# Grab binaries and hook script. Make sure that all are executable +wget -O /usr/local/bin/zdbfs https://github.com/threefoldtech/0-db-fs/releases/download/v0.1.11/zdbfs-0.1.11-amd64-linux-static +wget -O /usr/local/bin/zdb https://github.com/threefoldtech/0-db/releases/download/v2.0.8/zdb-2.0.8-linux-amd64-static +wget -O /bin/zstor https://github.com/threefoldtech/0-stor_v2/releases/download/v0.4.0/zstor_v2-x86_64-linux-musl +wget -O /usr/local/bin/zdb-hook.sh https://raw.githubusercontent.com/threefoldtech/quantum-storage/master/lib/zdb-hook.sh + +echo +echo Setting permissions for downloaded binaries +chmod +x /usr/local/bin/* /bin/zstor + +echo +echo Creating Zdbfs mountpoint +mkdir -p /mnt/qsfs + +echo +echo Starting up zinit services +zinit monitor zstor +zinit monitor zdb +zinit monitor zdbfs + +# Zdbfs will fail on first attempt because zdb isn't ready yet (could add a +# test to zdb to fix this, maybe using redis-cli, nc, or ss) +sleep 1 +zinit diff --git a/pulumi/util.py b/pulumi/util.py new file mode 100644 index 0000000..16aa39c --- /dev/null +++ b/pulumi/util.py @@ -0,0 +1,49 @@ +import subprocess + + +def run_script_ssh(ip, script): + counter = 1 + while True: + log_filename = f"ssh.{counter}.log" + try: + with open(log_filename, "x") 
as logfile: + subprocess.run( + [ + "ssh", + "-oStrictHostKeyChecking=accept-new", + "-oConnectionAttempts=5", + "root@" + ip, + # "bash", + # "-c", + script, + ], + stdout=logfile, + stderr=logfile, + ) + break + except FileExistsError: + counter += 1 + + +def scp(ip, source, destination): + # Meant for ipv6 + counter = 1 + while True: + log_filename = f"scp.{counter}.log" + try: + with open(log_filename, "x") as logfile: + subprocess.run( + [ + "scp", + "-r", + "-oStrictHostKeyChecking=accept-new", + "-oConnectionAttempts=5", + source, + f"root@[{ip}]:{destination}", + ], + stdout=logfile, + stderr=logfile, + ) + break + except FileExistsError: + counter += 1 diff --git a/pulumi/vars.example.py b/pulumi/vars.example.py new file mode 100644 index 0000000..00693d3 --- /dev/null +++ b/pulumi/vars.example.py @@ -0,0 +1,16 @@ +MNEMONIC = "your words here" +NETWORK = "test" + +# Node to deploy VM on. Can overlap with Zdb nodes or not, doesn't matter +VM_NODE = 5 + +# Nodes to deploy Zdbs on +META_NODES = [1, 3, 5, 8] +DATA_NODES = [1, 3, 5, 8] + +# Size of each data backend Zdb +DATA_SIZE = 1 + +# Network used to connect to the backend zdbs +# ZDB_CONNECTION = "mycelium" +ZDB_CONNECTION = "ipv6" diff --git a/pulumi/zinit/zdb.yaml b/pulumi/zinit/zdb.yaml new file mode 100644 index 0000000..34ae96c --- /dev/null +++ b/pulumi/zinit/zdb.yaml @@ -0,0 +1,10 @@ +exec: | + /usr/local/bin/zdb \ + --index /data/index \ + --data /data/data \ + --logfile /var/log/zdb.log \ + --datasize 67108864 \ + --hook /usr/local/bin/zdb-hook.sh \ + --rotate 900 +shutdown_timeout: 60 +after: [zstor] diff --git a/pulumi/zinit/zdbfs.yaml b/pulumi/zinit/zdbfs.yaml new file mode 100644 index 0000000..c999d7d --- /dev/null +++ b/pulumi/zinit/zdbfs.yaml @@ -0,0 +1,2 @@ +exec: /usr/local/bin/zdbfs /mnt/qsfs -o autons +after: [zdb] diff --git a/pulumi/zinit/zstor.yaml b/pulumi/zinit/zstor.yaml new file mode 100644 index 0000000..0f1a98f --- /dev/null +++ b/pulumi/zinit/zstor.yaml @@ -0,0 +1,6 @@ +exec: 
| + /bin/zstor \ + -c /etc/zstor-default.toml \ + --log_file /var/log/zstor.log \ + monitor +shutdown_timeout: 300 diff --git a/pulumi/zstor_config.base.example.toml b/pulumi/zstor_config.base.example.toml new file mode 100644 index 0000000..76dad34 --- /dev/null +++ b/pulumi/zstor_config.base.example.toml @@ -0,0 +1,20 @@ +minimal_shards = 2 +expected_shards = 4 +redundant_groups = 0 +redundant_nodes = 0 +root = "/" +zdbfs_mountpoint = "/mnt/qsfs" +socket = "/tmp/zstor.sock" +prometheus_port = 9100 +zdb_data_dir_path = "/data/data/zdbfs-data/" +max_zdb_data_dir_size = 2560 + +[compression] +algorithm = "snappy" + +[meta] +type = "zdb" + +[meta.config] +prefix = "zstor-meta" + From becfd88aa537c42f957d399e35fae508cfdf7444 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 5 Nov 2024 20:12:34 -0800 Subject: [PATCH 02/32] Fix post_deploy, naming --- pulumi/__main__.py | 18 ++++++++++++------ pulumi/post_deploy.sh | 2 ++ pulumi/util.py | 29 ++++++++++++++++------------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 2d678f9..22620c8 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -17,6 +17,8 @@ # Same for the base zstor config. Exit if the user didn't provide this ZSTOR_CONFIG_BASE = "zstor_config.base.toml" ZSTOR_CONFIG = "zstor_config.toml" +# This path is hard coded in the Zdb hook script +ZSTOR_CONFIG_REMOTE = "/etc/zstor-default.toml" if not os.path.exists(ZSTOR_CONFIG_BASE): exit("zstor_config.base.toml not found. 
Exiting.") @@ -57,11 +59,15 @@ zdb_pw = pulumi_random.RandomPassword("zdb_pw", length=20) if vars.ZDB_CONNECTION == "ipv6": - ZDB_INDEX = ZDB_IP6_INDEX + ZDB_IP_INDEX = ZDB_IP6_INDEX elif vars.ZDB_CONNECTION == "mycelium": - ZDB_INDEX = ZDB_MYC_INDEX + ZDB_IP_INDEX = ZDB_MYC_INDEX -provider = threefold.Provider("provider", mnemonic=MNEMONIC, network=NETWORK) +provider = threefold.Provider( + "provider", + mnemonic=MNEMONIC, + network=NETWORK, +) network = threefold.Network( "network", @@ -163,7 +169,7 @@ def post_deploy(args): """ file.write(textwrap.dedent(encryption_config)) for zdb in meta_zdbs: - ip = zdb["ips"][ZDB_INDEX] + ip = zdb["ips"][ZDB_IP_INDEX] ns = zdb["namespace"] file.write("[[meta.config.backends]]\n") file.write(f'address = "[{ip}]:9900"\n') @@ -172,7 +178,7 @@ def post_deploy(args): file.write("[[groups]]\n") for zdb in data_zdbs: - ip = zdb["ips"][ZDB_INDEX] + ip = zdb["ips"][ZDB_IP_INDEX] ns = zdb["namespace"] file.write("[[groups.backends]]\n") file.write(f'address = "[{ip}]:9900"\n') @@ -182,7 +188,7 @@ def post_deploy(args): # ssh_ip = vm["mycelium_ip"] ssh_ip = vm["computed_ip6"].split("/")[0] util.scp(ssh_ip, "zinit/", "/etc/") - util.scp(ssh_ip, ZSTOR_CONFIG, f"/etc/{ZSTOR_CONFIG}") + util.scp(ssh_ip, ZSTOR_CONFIG, ZSTOR_CONFIG_REMOTE) util.run_script_ssh(ssh_ip, POST_DEPLOY_SCRIPT) diff --git a/pulumi/post_deploy.sh b/pulumi/post_deploy.sh index 3000618..3622c9d 100644 --- a/pulumi/post_deploy.sh +++ b/pulumi/post_deploy.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -x + # Primitive idempotency zinit | grep -q zstor && exit diff --git a/pulumi/util.py b/pulumi/util.py index 16aa39c..d1a3978 100644 --- a/pulumi/util.py +++ b/pulumi/util.py @@ -7,19 +7,22 @@ def run_script_ssh(ip, script): log_filename = f"ssh.{counter}.log" try: with open(log_filename, "x") as logfile: - subprocess.run( - [ - "ssh", - "-oStrictHostKeyChecking=accept-new", - "-oConnectionAttempts=5", - "root@" + ip, - # "bash", - # "-c", - script, - ], - stdout=logfile, - 
stderr=logfile, - ) + with open(script, "r") as scriptfile: + script_contents = scriptfile.read() + subprocess.run( + [ + "ssh", + "-oStrictHostKeyChecking=accept-new", + "-oConnectionAttempts=5", + "root@" + ip, + "bash", + " -s", + ], + input=script_contents, + text=True, + stdout=logfile, + stderr=logfile, + ) break except FileExistsError: counter += 1 From 3d8017df3abdcb6602a2f8fdbcf615d7ec3b42a6 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 7 Nov 2024 19:03:05 -0800 Subject: [PATCH 03/32] Add rebuild test --- pulumi/tests/rebuild/Pulumi.yaml | 2 ++ pulumi/tests/rebuild/__main__.py | 1 + pulumi/tests/rebuild/post_deploy.sh | 1 + pulumi/tests/rebuild/run.sh | 6 ++++ pulumi/tests/rebuild/scripts/deploy.sh | 9 ++++++ pulumi/tests/rebuild/scripts/destroy.sh | 7 +++++ pulumi/tests/rebuild/scripts/redeploy.sh | 19 +++++++++++ pulumi/tests/rebuild/scripts/write_data.sh | 35 +++++++++++++++++++++ pulumi/tests/rebuild/util.py | 1 + pulumi/tests/rebuild/vars.new.py | 20 ++++++++++++ pulumi/tests/rebuild/vars.original.py | 20 ++++++++++++ pulumi/tests/rebuild/zinit | 1 + pulumi/tests/rebuild/zstor_config.base.toml | 20 ++++++++++++ 13 files changed, 142 insertions(+) create mode 100644 pulumi/tests/rebuild/Pulumi.yaml create mode 120000 pulumi/tests/rebuild/__main__.py create mode 120000 pulumi/tests/rebuild/post_deploy.sh create mode 100644 pulumi/tests/rebuild/run.sh create mode 100755 pulumi/tests/rebuild/scripts/deploy.sh create mode 100755 pulumi/tests/rebuild/scripts/destroy.sh create mode 100755 pulumi/tests/rebuild/scripts/redeploy.sh create mode 100755 pulumi/tests/rebuild/scripts/write_data.sh create mode 120000 pulumi/tests/rebuild/util.py create mode 100644 pulumi/tests/rebuild/vars.new.py create mode 100644 pulumi/tests/rebuild/vars.original.py create mode 120000 pulumi/tests/rebuild/zinit create mode 100644 pulumi/tests/rebuild/zstor_config.base.toml diff --git a/pulumi/tests/rebuild/Pulumi.yaml b/pulumi/tests/rebuild/Pulumi.yaml new file mode 
100644 index 0000000..096294d --- /dev/null +++ b/pulumi/tests/rebuild/Pulumi.yaml @@ -0,0 +1,2 @@ +name: qsfs-rebuild-test +runtime: python diff --git a/pulumi/tests/rebuild/__main__.py b/pulumi/tests/rebuild/__main__.py new file mode 120000 index 0000000..74b2cfc --- /dev/null +++ b/pulumi/tests/rebuild/__main__.py @@ -0,0 +1 @@ +../../__main__.py \ No newline at end of file diff --git a/pulumi/tests/rebuild/post_deploy.sh b/pulumi/tests/rebuild/post_deploy.sh new file mode 120000 index 0000000..149f4fd --- /dev/null +++ b/pulumi/tests/rebuild/post_deploy.sh @@ -0,0 +1 @@ +../../post_deploy.sh \ No newline at end of file diff --git a/pulumi/tests/rebuild/run.sh b/pulumi/tests/rebuild/run.sh new file mode 100644 index 0000000..ac492e7 --- /dev/null +++ b/pulumi/tests/rebuild/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +scripts/deploy.sh +scripts/write_data.sh +scripts/redeploy.sh +scripts/destroy.sh diff --git a/pulumi/tests/rebuild/scripts/deploy.sh b/pulumi/tests/rebuild/scripts/deploy.sh new file mode 100755 index 0000000..0bd645d --- /dev/null +++ b/pulumi/tests/rebuild/scripts/deploy.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +pulumi stack init test + +cp vars.original.py vars.py +pulumi up -s test -y --non-interactive diff --git a/pulumi/tests/rebuild/scripts/destroy.sh b/pulumi/tests/rebuild/scripts/destroy.sh new file mode 100755 index 0000000..92d90e7 --- /dev/null +++ b/pulumi/tests/rebuild/scripts/destroy.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# We need this to run non interactively. 
Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +pulumi down -s test -y --non-interactive +pulumi stack rm -yf test diff --git a/pulumi/tests/rebuild/scripts/redeploy.sh b/pulumi/tests/rebuild/scripts/redeploy.sh new file mode 100755 index 0000000..b13e8e7 --- /dev/null +++ b/pulumi/tests/rebuild/scripts/redeploy.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +pulumi stack init test + +cp vars.new.py vars.py +pulumi up -s test -y --non-interactive + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +ssh -t root@$ipv6 ' + pkill zstor -SIGUSR1 + # Wait some time to let the rebuild process start. This sould be enough? + sleep 10 + # Output should show us if any data has been written to the new backends yet + zstor -c /etc/zstor-default.toml status +' diff --git a/pulumi/tests/rebuild/scripts/write_data.sh b/pulumi/tests/rebuild/scripts/write_data.sh new file mode 100755 index 0000000..9cad9c9 --- /dev/null +++ b/pulumi/tests/rebuild/scripts/write_data.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +ssh -t root@$ipv6 ' + echo "===== Creating 10 test files with 100MB random data each =====" + # Create 10 files with 100mb random data + for i in {1..10}; do + echo "Creating file$i.dat..." 
+ dd if=/dev/urandom of=file$i.dat bs=1M count=100 + done + + echo -e "\n===== Calculating MD5 checksums of source files =====" + # Calculate and print MD5 sum for each file + for i in {1..10}; do + md5sum file$i.dat + done + + echo -e "\n===== Installing pv tool for transfer monitoring =====" + apt update &> /dev/null && apt install -y pv &> /dev/null + + echo -e "\n===== Copying files to QSFS mount with progress monitoring =====" + # Copy files to the qsfs mount and check speed + for i in {1..10}; do + echo "Copying file$i.dat..." + pv -s 100m "file$i.dat" > "/mnt/qsfs/file$i.dat" + done + + echo -e "\n===== Checking zstor backend status =====" + # Print zstor status to give an idea of whether data was uploaded to backends + zstor -c /etc/zstor-default.toml status +' diff --git a/pulumi/tests/rebuild/util.py b/pulumi/tests/rebuild/util.py new file mode 120000 index 0000000..2174c92 --- /dev/null +++ b/pulumi/tests/rebuild/util.py @@ -0,0 +1 @@ +../../util.py \ No newline at end of file diff --git a/pulumi/tests/rebuild/vars.new.py b/pulumi/tests/rebuild/vars.new.py new file mode 100644 index 0000000..ba07c79 --- /dev/null +++ b/pulumi/tests/rebuild/vars.new.py @@ -0,0 +1,20 @@ +# These are the new values used in the test + +MNEMONIC = "" +NETWORK = "test" +# Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub +SSH_KEY = "" + +# Node to deploy VM on. 
Can overlap with Zdb nodes or not, doesn't matter +VM_NODE = 5 + +# Nodes to deploy Zdbs on +META_NODES = [1, 2, 3, 7] +DATA_NODES = META_NODES + +# Size of each data backend Zdb +DATA_SIZE = 1 + +# Network used to connect to the backend zdbs +# ZDB_CONNECTION = "mycelium" +ZDB_CONNECTION = "ipv6" diff --git a/pulumi/tests/rebuild/vars.original.py b/pulumi/tests/rebuild/vars.original.py new file mode 100644 index 0000000..f0958d2 --- /dev/null +++ b/pulumi/tests/rebuild/vars.original.py @@ -0,0 +1,20 @@ +# These are the original values used in the test + +MNEMONIC = "" +NETWORK = "test" +# Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub +SSH_KEY = "" + +# Node to deploy VM on. Can overlap with Zdb nodes or not, doesn't matter +VM_NODE = 5 + +# Nodes to deploy Zdbs on +META_NODES = [1, 2, 3, 5] +DATA_NODES = META_NODES + +# Size of each data backend Zdb +DATA_SIZE = 1 + +# Network used to connect to the backend zdbs +# ZDB_CONNECTION = "mycelium" +ZDB_CONNECTION = "ipv6" diff --git a/pulumi/tests/rebuild/zinit b/pulumi/tests/rebuild/zinit new file mode 120000 index 0000000..ed52882 --- /dev/null +++ b/pulumi/tests/rebuild/zinit @@ -0,0 +1 @@ +../../zinit/ \ No newline at end of file diff --git a/pulumi/tests/rebuild/zstor_config.base.toml b/pulumi/tests/rebuild/zstor_config.base.toml new file mode 100644 index 0000000..76dad34 --- /dev/null +++ b/pulumi/tests/rebuild/zstor_config.base.toml @@ -0,0 +1,20 @@ +minimal_shards = 2 +expected_shards = 4 +redundant_groups = 0 +redundant_nodes = 0 +root = "/" +zdbfs_mountpoint = "/mnt/qsfs" +socket = "/tmp/zstor.sock" +prometheus_port = 9100 +zdb_data_dir_path = "/data/data/zdbfs-data/" +max_zdb_data_dir_size = 2560 + +[compression] +algorithm = "snappy" + +[meta] +type = "zdb" + +[meta.config] +prefix = "zstor-meta" + From 260f51163b0bddc5c0dbd9d98c3e7f8a14b97820 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 7 Nov 2024 19:03:45 -0800 Subject: [PATCH 04/32] Use env vars, better SSH key discovery 
--- pulumi/__main__.py | 12 ++++++++---- pulumi/util.py | 22 ++++++++++++++++++++++ pulumi/vars.example.py | 3 +++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 22620c8..faba56d 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -2,6 +2,7 @@ import secrets import shutil import textwrap + import pulumi import pulumi_random import pulumi_threefold as threefold @@ -34,11 +35,12 @@ ZDB_MYC_INDEX = -1 # From here are all the parameters for the deployment -MNEMONIC = vars.MNEMONIC -NETWORK = vars.NETWORK +MNEMONIC = vars.MNEMONIC if vars.MNEMONIC else os.environ.get("MNEMONIC") +NETWORK = vars.NETWORK if vars.NETWORK else os.environ.get("NETWORK") -with open(os.path.expanduser("~/.ssh/id_rsa.pub")) as file: - SSH_KEY = file.read() +SSH_KEY = vars.SSH_KEY if vars.SSH_KEY else util.get_ssh_key_from_disk() +if not SSH_KEY: + SSH_KEY = input("Please enter your public SSH key: ") VM_NODE = vars.VM_NODE FLIST = "https://hub.grid.tf/tf-official-apps/threefoldtech-ubuntu-22.04.flist" @@ -75,6 +77,7 @@ description="A network", nodes=[VM_NODE], ip_range="10.1.0.0/16", + # TODO: for some reason the mycelium keys seem to get regenerated if we update the deployment (like replacing zdbs) mycelium=True, opts=pulumi.ResourceOptions(provider=provider), ) @@ -93,6 +96,7 @@ vms.append( threefold.VMInputArgs( name="vm", + node_id=node, flist=FLIST, entrypoint="/sbin/zinit init", network_name=net_name, diff --git a/pulumi/util.py b/pulumi/util.py index d1a3978..3f11b3c 100644 --- a/pulumi/util.py +++ b/pulumi/util.py @@ -1,3 +1,4 @@ +import os import subprocess @@ -50,3 +51,24 @@ def scp(ip, source, destination): break except FileExistsError: counter += 1 + + +def get_ssh_key_from_disk(): + key_paths = [ + os.path.expanduser("~/.ssh/id_rsa.pub"), + os.path.expanduser("~/.ssh/id_ed25519.pub"), + os.path.expanduser("~/.ssh/id_ecdsa.pub"), + os.path.expanduser("~/.ssh/id_dsa.pub"), + ] + + ssh_key = None + + for path in 
key_paths: + try: + with open(path) as file: + ssh_key = file.read() + break + except (FileNotFoundError, OSError): + continue + + return ssh_key diff --git a/pulumi/vars.example.py b/pulumi/vars.example.py index 00693d3..6bf2c2f 100644 --- a/pulumi/vars.example.py +++ b/pulumi/vars.example.py @@ -1,5 +1,8 @@ +# These can also be specified as env vars with the same names MNEMONIC = "your words here" NETWORK = "test" +# Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub +SSH_KEY = "" # Node to deploy VM on. Can overlap with Zdb nodes or not, doesn't matter VM_NODE = 5 From bfe95e06ff7366cf7bb079f509f3b536aa1f7d4f Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 7 Nov 2024 19:27:07 -0800 Subject: [PATCH 05/32] Fix typo --- pulumi/tests/rebuild/scripts/redeploy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pulumi/tests/rebuild/scripts/redeploy.sh b/pulumi/tests/rebuild/scripts/redeploy.sh index b13e8e7..1a56b46 100755 --- a/pulumi/tests/rebuild/scripts/redeploy.sh +++ b/pulumi/tests/rebuild/scripts/redeploy.sh @@ -12,7 +12,7 @@ ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut ssh -t root@$ipv6 ' pkill zstor -SIGUSR1 - # Wait some time to let the rebuild process start. This sould be enough? + # Wait some time to let the rebuild process start. This should be enough? 
sleep 10 # Output should show us if any data has been written to the new backends yet zstor -c /etc/zstor-default.toml status From 8178063d8e2554450942abea6e812aa7f170c41c Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 14 Nov 2024 16:46:48 -0800 Subject: [PATCH 06/32] Add SSH key to vars example --- pulumi/vars.example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pulumi/vars.example.py b/pulumi/vars.example.py index 6bf2c2f..7dce8c5 100644 --- a/pulumi/vars.example.py +++ b/pulumi/vars.example.py @@ -1,6 +1,7 @@ # These can also be specified as env vars with the same names MNEMONIC = "your words here" NETWORK = "test" + # Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub SSH_KEY = "" From 6b05752f720692dcaecf4fcd28e1b3768f57d674 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 14 Nov 2024 16:47:59 -0800 Subject: [PATCH 07/32] Don't proceed until VM responds to ping --- pulumi/__main__.py | 6 ++++++ pulumi/util.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index faba56d..6e3b337 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -15,6 +15,10 @@ except ModuleNotFoundError: exit("vars.py not found. Exiting.") +# How many times we try pinging the VM after deployment before proceeding with +# SSH based commands. Total time to wait will be this times 10 seconds +PING_RETRIES = 10 + # Same for the base zstor config. Exit if the user didn't provide this ZSTOR_CONFIG_BASE = "zstor_config.base.toml" ZSTOR_CONFIG = "zstor_config.toml" @@ -191,6 +195,8 @@ def post_deploy(args): # ssh_ip = vm["mycelium_ip"] ssh_ip = vm["computed_ip6"].split("/")[0] + util.wait_for_host(ssh_ip, PING_RETRIES) + # Do we also need to wait here in case ping starts working before SSH? 
util.scp(ssh_ip, "zinit/", "/etc/") util.scp(ssh_ip, ZSTOR_CONFIG, ZSTOR_CONFIG_REMOTE) util.run_script_ssh(ssh_ip, POST_DEPLOY_SCRIPT) diff --git a/pulumi/util.py b/pulumi/util.py index 3f11b3c..673ba06 100644 --- a/pulumi/util.py +++ b/pulumi/util.py @@ -72,3 +72,24 @@ def get_ssh_key_from_disk(): continue return ssh_key + + +def wait_for_host(host, max_retries=None): + """Ping a host until it responds or max retries is reached. Since the default timeout on Linux systems is typically 10 seconds, the total timeout will be that times the number of retries.""" + import subprocess + + retry_count = 0 + + while True: + if max_retries and retry_count >= max_retries: + return False + + try: + # Ping the host once + command = ["ping", "-c", "1", host] + subprocess.check_output(command) + return True + + except subprocess.CalledProcessError: + retry_count += 1 + continue From 6583cef6c96282ac36bf907862f3913cfe5777cd Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 14 Nov 2024 16:51:45 -0800 Subject: [PATCH 08/32] Add prometheus --- pulumi/__main__.py | 2 ++ pulumi/post_deploy.sh | 13 ++++++++++++- pulumi/prometheus.example.yaml | 16 ++++++++++++++++ pulumi/zinit/node-exporter.yaml | 1 + pulumi/zinit/prometheus.yaml | 1 + pulumi/zstor_config.base.example.toml | 2 +- 6 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 pulumi/prometheus.example.yaml create mode 100644 pulumi/zinit/node-exporter.yaml create mode 100644 pulumi/zinit/prometheus.yaml diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 6e3b337..cd30b95 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -199,6 +199,8 @@ def post_deploy(args): # Do we also need to wait here in case ping starts working before SSH? 
util.scp(ssh_ip, "zinit/", "/etc/") util.scp(ssh_ip, ZSTOR_CONFIG, ZSTOR_CONFIG_REMOTE) + if os.path.isfile("prometheus.yaml"): + util.scp(ssh_ip, "prometheus.yaml", "/etc/") util.run_script_ssh(ssh_ip, POST_DEPLOY_SCRIPT) diff --git a/pulumi/post_deploy.sh b/pulumi/post_deploy.sh index 3622c9d..3e481d0 100644 --- a/pulumi/post_deploy.sh +++ b/pulumi/post_deploy.sh @@ -25,7 +25,18 @@ zinit monitor zstor zinit monitor zdb zinit monitor zdbfs +# If user didn't supply a prometheus config, then give the files a non "yaml" +# extension so zinit doesn't start them if the VM reboots +if [ -f /etc/prometheus.yaml ]; then + apt install -y prometheus + zinit monitor node-exporter + zinit monitor prometheus +else + mv /etc/zinit/prometheus.yaml /etc/zinit/prometheus.yaml.deactivated + mv /etc/zinit/node-exporter.yaml /etc/zinit/node-exporter.yaml.deactivated +fi + # Zdbfs will fail on first attempt because zdb isn't ready yet (could add a # test to zdb to fix this, maybe using redis-cli, nc, or ss) -sleep 1 +sleep 1 zinit diff --git a/pulumi/prometheus.example.yaml b/pulumi/prometheus.example.yaml new file mode 100644 index 0000000..3c13abc --- /dev/null +++ b/pulumi/prometheus.example.yaml @@ -0,0 +1,16 @@ +global: + scrape_interval: 60s + external_labels: + origin_prometheus: prometheus01 +remote_write: + - url: https://your-prometheus.url + basic_auth: + username: user + password: password +scrape_configs: + - job_name: zstor + static_configs: + - targets: ["localhost:9200"] + - job_name: node-exporter-zstor + static_configs: + - targets: ["localhost:9100"] diff --git a/pulumi/zinit/node-exporter.yaml b/pulumi/zinit/node-exporter.yaml new file mode 100644 index 0000000..1a466e1 --- /dev/null +++ b/pulumi/zinit/node-exporter.yaml @@ -0,0 +1 @@ +exec: prometheus-node-exporter diff --git a/pulumi/zinit/prometheus.yaml b/pulumi/zinit/prometheus.yaml new file mode 100644 index 0000000..3e07ecc --- /dev/null +++ b/pulumi/zinit/prometheus.yaml @@ -0,0 +1 @@ +exec: prometheus 
--config.file=/etc/prometheus.yaml diff --git a/pulumi/zstor_config.base.example.toml b/pulumi/zstor_config.base.example.toml index 76dad34..f088569 100644 --- a/pulumi/zstor_config.base.example.toml +++ b/pulumi/zstor_config.base.example.toml @@ -5,7 +5,7 @@ redundant_nodes = 0 root = "/" zdbfs_mountpoint = "/mnt/qsfs" socket = "/tmp/zstor.sock" -prometheus_port = 9100 +prometheus_port = 9200 zdb_data_dir_path = "/data/data/zdbfs-data/" max_zdb_data_dir_size = 2560 From 6c201490c809ee5c2e0d19017706df84d3d56bde Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 14 Nov 2024 17:00:29 -0800 Subject: [PATCH 09/32] Support using custom zstor binary --- pulumi/__main__.py | 2 ++ pulumi/post_deploy.sh | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index cd30b95..a5ddcf6 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -201,6 +201,8 @@ def post_deploy(args): util.scp(ssh_ip, ZSTOR_CONFIG, ZSTOR_CONFIG_REMOTE) if os.path.isfile("prometheus.yaml"): util.scp(ssh_ip, "prometheus.yaml", "/etc/") + if os.path.isfile("zstor"): + util.scp(ssh_ip, "zstor", "/usr/bin/") util.run_script_ssh(ssh_ip, POST_DEPLOY_SCRIPT) diff --git a/pulumi/post_deploy.sh b/pulumi/post_deploy.sh index 3e481d0..426f8ac 100644 --- a/pulumi/post_deploy.sh +++ b/pulumi/post_deploy.sh @@ -6,10 +6,23 @@ set -x zinit | grep -q zstor && exit # Grab binaries and hook script. 
Make sure that all are executable -wget -O /usr/local/bin/zdbfs https://github.com/threefoldtech/0-db-fs/releases/download/v0.1.11/zdbfs-0.1.11-amd64-linux-static -wget -O /usr/local/bin/zdb https://github.com/threefoldtech/0-db/releases/download/v2.0.8/zdb-2.0.8-linux-amd64-static -wget -O /bin/zstor https://github.com/threefoldtech/0-stor_v2/releases/download/v0.4.0/zstor_v2-x86_64-linux-musl -wget -O /usr/local/bin/zdb-hook.sh https://raw.githubusercontent.com/threefoldtech/quantum-storage/master/lib/zdb-hook.sh +# We check first if the files exist, to support testing other builds by +# uploading them into the VM before running this script +if ! [ -f /usr/local/bin/zdbfs ]; then + wget -O /usr/local/bin/zdbfs https://github.com/threefoldtech/0-db-fs/releases/download/v0.1.11/zdbfs-0.1.11-amd64-linux-static +fi + +if ! [ -f /usr/local/bin/zdb ]; then + wget -O /usr/local/bin/zdb https://github.com/threefoldtech/0-db/releases/download/v2.0.8/zdb-2.0.8-linux-amd64-static +fi + +if ! [ -f /usr/local/bin/zdb-hook.sh ]; then + wget -O /usr/local/bin/zdb-hook.sh https://raw.githubusercontent.com/threefoldtech/quantum-storage/master/lib/zdb-hook.sh +fi + +if ! 
[ -f /bin/zstor ]; then + wget -O /bin/zstor https://github.com/threefoldtech/0-stor_v2/releases/download/v0.4.0/zstor_v2-x86_64-linux-musl +fi echo echo Setting permissions for downloaded binaries From c869fa4a557e80947ab43cd86f323402a36cfdde Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 19 Nov 2024 12:42:09 -0800 Subject: [PATCH 10/32] Use pulumi_command --- pulumi/README.md | 2 +- pulumi/__main__.py | 153 ++++++++++++++---- pulumi/scripts/activate_qsfs.sh | 40 +++++ pulumi/{post_deploy.sh => scripts/prep_vm.sh} | 31 +--- pulumi/util.py | 95 ----------- pulumi/vars.example.py | 10 +- 6 files changed, 170 insertions(+), 161 deletions(-) create mode 100644 pulumi/scripts/activate_qsfs.sh rename pulumi/{post_deploy.sh => scripts/prep_vm.sh} (55%) delete mode 100644 pulumi/util.py diff --git a/pulumi/README.md b/pulumi/README.md index ec690d3..94b265f 100644 --- a/pulumi/README.md +++ b/pulumi/README.md @@ -21,7 +21,7 @@ We need some Python packages to make this work. Using a venv is recommended. ``` python -m venv .venv source .venv/bin/activate -pip install pulumi pulumi_random pulumi_threefold +pip install pulumi pulumi_random pulumi_command pulumi_threefold ``` ## Prep config diff --git a/pulumi/__main__.py b/pulumi/__main__.py index a5ddcf6..a3bde09 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -5,13 +5,22 @@ import pulumi import pulumi_random +import pulumi_command import pulumi_threefold as threefold -import util - # It's up to the user to create their own vars.py before trying to deploy try: - import vars + from vars import ( + MNEMONIC, + NETWORK, + SSH_KEY_PATH, + VM_NODE, + META_NODES, + DATA_NODES, + DATA_SIZE, + ZDB_CONNECTION, + SSH_CONNECTION, + ) except ModuleNotFoundError: exit("vars.py not found. Exiting.") @@ -21,7 +30,7 @@ # Same for the base zstor config. 
Exit if the user didn't provide this ZSTOR_CONFIG_BASE = "zstor_config.base.toml" -ZSTOR_CONFIG = "zstor_config.toml" +ZSTOR_CONFIG_PATH = "zstor_config{}.toml" # This path is hard coded in the Zdb hook script ZSTOR_CONFIG_REMOTE = "/etc/zstor-default.toml" @@ -39,23 +48,23 @@ ZDB_MYC_INDEX = -1 # From here are all the parameters for the deployment -MNEMONIC = vars.MNEMONIC if vars.MNEMONIC else os.environ.get("MNEMONIC") -NETWORK = vars.NETWORK if vars.NETWORK else os.environ.get("NETWORK") +MNEMONIC = MNEMONIC if MNEMONIC else os.environ.get("MNEMONIC") +NETWORK = NETWORK if NETWORK else os.environ.get("NETWORK") + +ssh_key_path = os.path.expanduser(SSH_KEY_PATH) -SSH_KEY = vars.SSH_KEY if vars.SSH_KEY else util.get_ssh_key_from_disk() -if not SSH_KEY: - SSH_KEY = input("Please enter your public SSH key: ") +with open(ssh_key_path, "r") as file: + ssh_private_key = file.read() + +with open(ssh_key_path + ".pub", "r") as file: + ssh_public_key = file.read() -VM_NODE = vars.VM_NODE FLIST = "https://hub.grid.tf/tf-official-apps/threefoldtech-ubuntu-22.04.flist" CPU = 1 RAM = 2048 # MB ROOTFS = 1024 * 15 # MB NET_NAME = "net" -META_NODES = vars.META_NODES -DATA_NODES = vars.DATA_NODES -DATA_SIZE = vars.DATA_SIZE META_SIZE = 1 # Generate separate secrets for Zstor key and Zdb namespaces passwords @@ -64,9 +73,9 @@ zstor_key = pulumi_random.RandomBytes("zstor_key", length=32) zdb_pw = pulumi_random.RandomPassword("zdb_pw", length=20) -if vars.ZDB_CONNECTION == "ipv6": +if ZDB_CONNECTION == "ipv6": ZDB_IP_INDEX = ZDB_IP6_INDEX -elif vars.ZDB_CONNECTION == "mycelium": +elif ZDB_CONNECTION == "mycelium": ZDB_IP_INDEX = ZDB_MYC_INDEX provider = threefold.Provider( @@ -111,7 +120,7 @@ planetary=True, public_ip6=True, env_vars={ - "SSH_KEY": SSH_KEY, + "SSH_KEY": ssh_public_key, }, ) ) @@ -146,10 +155,32 @@ ) -def post_deploy(args): - # TODO: Don't overwrite existing file if it's there - # Actually, maybe it's okay as long as we have the secrets persisted - 
shutil.copy(ZSTOR_CONFIG_BASE, ZSTOR_CONFIG) +def make_ssh_connection(vm): + if SSH_CONNECTION == "mycelium": + ssh_ip = vm["mycelium_ip"] + elif SSH_CONNECTION == "ipv6": + # This allows us to call this from either inside an apply or outside + if isinstance(vm["computed_ip6"], str): + ssh_ip = vm["computed_ip6"].split("/")[0] + else: + ssh_ip = vm["computed_ip6"].apply(lambda ip6: ip6.split("/")[0]) + + return pulumi_command.remote.ConnectionArgs( + host=ssh_ip, + user="root", + private_key=ssh_private_key, + ) + + +def make_zstor_config(args): + # Changes to the zdb backends mean that we need to regenerate the config + # file. Here we always choose a new local path and leave the old files + # around just in case + i = 1 + while os.path.exists(path := ZSTOR_CONFIG_PATH.format(i)): + i += 1 + + shutil.copy(ZSTOR_CONFIG_BASE, path) meta_zdbs = [] data_zdbs = [] @@ -165,7 +196,7 @@ def post_deploy(args): meta_zdbs = sorted(meta_zdbs, key=lambda z: z["namespace"].split("-")[-1]) data_zdbs = sorted(data_zdbs, key=lambda z: z["namespace"].split("-")[-1]) - with open(ZSTOR_CONFIG, "a") as file: + with open(path, "a") as file: encryption_config = f""" [encryption] algorithm = "AES" @@ -193,25 +224,79 @@ def post_deploy(args): file.write(f'namespace = "{ns}"\n') file.write(f'password = "{args['zdb_pw']}"\n\n') - # ssh_ip = vm["mycelium_ip"] - ssh_ip = vm["computed_ip6"].split("/")[0] - util.wait_for_host(ssh_ip, PING_RETRIES) - # Do we also need to wait here in case ping starts working before SSH? 
- util.scp(ssh_ip, "zinit/", "/etc/") - util.scp(ssh_ip, ZSTOR_CONFIG, ZSTOR_CONFIG_REMOTE) - if os.path.isfile("prometheus.yaml"): - util.scp(ssh_ip, "prometheus.yaml", "/etc/") - if os.path.isfile("zstor"): - util.scp(ssh_ip, "zstor", "/usr/bin/") - util.run_script_ssh(ssh_ip, POST_DEPLOY_SCRIPT) + # We have to do this inside the function, because it's not possible to use + # an Output to specify the path to a FileAsset, and any value we return + # from this function becomes an Output + pulumi_command.remote.CopyToRemote( + "copy_zstor_config", + connection=make_ssh_connection(vm), + source=pulumi.FileAsset(path), + remote_path=ZSTOR_CONFIG_REMOTE, + triggers=[conn.host], + ) -pulumi.Output.all( +zstor_config = pulumi.Output.all( deployments=[(d.vms_computed, d.zdbs_computed) for d in deployments.values()], zstor_key=zstor_key.hex, zdb_pw=zdb_pw.result, -).apply(post_deploy) +).apply(make_zstor_config) vm = deployments[VM_NODE].vms_computed[0] +conn = make_ssh_connection(vm) + +if os.path.isfile("prometheus.yaml"): + pulumi_command.remote.CopyToRemote( + "copy_prometheus", + connection=conn, + source=pulumi.FileAsset("prometheus.yaml"), + remote_path="/etc/prometheus.yaml", + triggers=[conn.host], + ) + +# In case we want to test our own zstor binary, such as a prebuild +if os.path.isfile("zstor"): + pulumi_command.remote.CopyToRemote( + "copy_zstor_binary", + connection=conn, + source=pulumi.FileAsset("zstor"), + remote_path="/usr/bin/zstor", + triggers=[conn.host], + ) + +# We put the zinit files under /root to start, so that the services don't get +# started accidentally on reboot. 
In the case of recovering on a new VM, we +# need to ensure some other steps are completed first +copy_zinit = pulumi_command.remote.CopyToRemote( + "copy_zinit", + connection=conn, + source=pulumi.FileArchive("zinit/"), + remote_path="/root/zinit/", + triggers=[conn.host], +) + +copy_scripts = pulumi_command.remote.CopyToRemote( + "copy_scripts", + connection=conn, + source=pulumi.FileArchive("scripts/"), + remote_path="/root/scripts/", + triggers=[conn.host], +) + +prep_vm = pulumi_command.remote.Command( + "prep_vm", + connection=conn, + create="bash /root/scripts/prep_vm.sh |& tee > /var/log/prep_vm.log", + triggers=[conn.host], + opts=pulumi.ResourceOptions(depends_on=[copy_scripts]), +) + +pulumi_command.remote.Command( + "activate_qsfs", + connection=conn, + create="bash /root/scripts/activate_qsfs.sh |& tee > /var/log/activate_qsfs.log", + opts=pulumi.ResourceOptions(depends_on=[prep_vm, copy_zinit]), +) + pulumi.export("mycelium_ip", vm.mycelium_ip) pulumi.export("pub_ipv6", vm.computed_ip6) diff --git a/pulumi/scripts/activate_qsfs.sh b/pulumi/scripts/activate_qsfs.sh new file mode 100644 index 0000000..3cd1888 --- /dev/null +++ b/pulumi/scripts/activate_qsfs.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# This script starts up the qsfs, ensuring the mount point exists + +set -x + +# Primitive idempotency +zinit | grep -q zstor && exit + +echo +echo Creating Zdbfs mountpoint +mkdir -p /mnt/qsfs + +echo +echo Copying zinit service files +cp /root/zinit/zstor.yaml /etc/zinit +cp /root/zinit/zdb.yaml /etc/zinit +cp /root/zinit/zdbfs.yaml /etc/zinit + +echo +echo Starting up zinit services +zinit monitor zstor +zinit monitor zdb +zinit monitor zdbfs + +if [ -f /etc/prometheus.yaml ]; then + echo + echo Installing Prometheus + apt install -y prometheus + + echo + echo Copying Prometheus zinit service files + cp /root/zinit/node-exporter.yaml /etc/zinit + cp /root/zinit/prometheus.yaml /etc/zinit + + echo + echo Starting up Prometheus zinit services + zinit monitor 
node-exporter + zinit monitor prometheus +fi diff --git a/pulumi/post_deploy.sh b/pulumi/scripts/prep_vm.sh similarity index 55% rename from pulumi/post_deploy.sh rename to pulumi/scripts/prep_vm.sh index 426f8ac..bba3850 100644 --- a/pulumi/post_deploy.sh +++ b/pulumi/scripts/prep_vm.sh @@ -1,9 +1,8 @@ #!/bin/bash -set -x +# This script installs all binaries and scripts needed for QSFS. It doesn't actually start up the services though -# Primitive idempotency -zinit | grep -q zstor && exit +set -x # Grab binaries and hook script. Make sure that all are executable # We check first if the files exist, to support testing other builds by @@ -27,29 +26,3 @@ fi echo echo Setting permissions for downloaded binaries chmod +x /usr/local/bin/* /bin/zstor - -echo -echo Creating Zdbfs mountpoint -mkdir -p /mnt/qsfs - -echo -echo Starting up zinit services -zinit monitor zstor -zinit monitor zdb -zinit monitor zdbfs - -# If user didn't supply a prometheus config, then give the files a non "yaml" -# extension so zinit doesn't start them if the VM reboots -if [ -f /etc/prometheus.yaml ]; then - apt install -y prometheus - zinit monitor node-exporter - zinit monitor prometheus -else - mv /etc/zinit/prometheus.yaml /etc/zinit/prometheus.yaml.deactivated - mv /etc/zinit/node-exporter.yaml /etc/zinit/node-exporter.yaml.deactivated -fi - -# Zdbfs will fail on first attempt because zdb isn't ready yet (could add a -# test to zdb to fix this, maybe using redis-cli, nc, or ss) -sleep 1 -zinit diff --git a/pulumi/util.py b/pulumi/util.py deleted file mode 100644 index 673ba06..0000000 --- a/pulumi/util.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import subprocess - - -def run_script_ssh(ip, script): - counter = 1 - while True: - log_filename = f"ssh.{counter}.log" - try: - with open(log_filename, "x") as logfile: - with open(script, "r") as scriptfile: - script_contents = scriptfile.read() - subprocess.run( - [ - "ssh", - "-oStrictHostKeyChecking=accept-new", - 
"-oConnectionAttempts=5", - "root@" + ip, - "bash", - " -s", - ], - input=script_contents, - text=True, - stdout=logfile, - stderr=logfile, - ) - break - except FileExistsError: - counter += 1 - - -def scp(ip, source, destination): - # Meant for ipv6 - counter = 1 - while True: - log_filename = f"scp.{counter}.log" - try: - with open(log_filename, "x") as logfile: - subprocess.run( - [ - "scp", - "-r", - "-oStrictHostKeyChecking=accept-new", - "-oConnectionAttempts=5", - source, - f"root@[{ip}]:{destination}", - ], - stdout=logfile, - stderr=logfile, - ) - break - except FileExistsError: - counter += 1 - - -def get_ssh_key_from_disk(): - key_paths = [ - os.path.expanduser("~/.ssh/id_rsa.pub"), - os.path.expanduser("~/.ssh/id_ed25519.pub"), - os.path.expanduser("~/.ssh/id_ecdsa.pub"), - os.path.expanduser("~/.ssh/id_dsa.pub"), - ] - - ssh_key = None - - for path in key_paths: - try: - with open(path) as file: - ssh_key = file.read() - break - except (FileNotFoundError, OSError): - continue - - return ssh_key - - -def wait_for_host(host, max_retries=None): - """Ping a host until it responds or max retries is reached. Since the default timeout on Linux systems is typically 10 seconds, the total timeout will be that times the number of retries.""" - import subprocess - - retry_count = 0 - - while True: - if max_retries and retry_count >= max_retries: - return False - - try: - # Ping the host once - command = ["ping", "-c", "1", host] - subprocess.check_output(command) - return True - - except subprocess.CalledProcessError: - retry_count += 1 - continue diff --git a/pulumi/vars.example.py b/pulumi/vars.example.py index 7dce8c5..bb759c3 100644 --- a/pulumi/vars.example.py +++ b/pulumi/vars.example.py @@ -2,8 +2,10 @@ MNEMONIC = "your words here" NETWORK = "test" -# Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub -SSH_KEY = "" +# In order to run commands on the deployed VM, we need both the public and +# private key files available. 
This takes the path to the private key file, and +# there should be a matching .pub file +SSH_KEY_PATH = "~/.ssh/id_rsa" # Node to deploy VM on. Can overlap with Zdb nodes or not, doesn't matter VM_NODE = 5 @@ -18,3 +20,7 @@ # Network used to connect to the backend zdbs # ZDB_CONNECTION = "mycelium" ZDB_CONNECTION = "ipv6" + +# Network used for SSH connection +# SSH_CONNECTION = "mycelium" +SSH_CONNECTION = "ipv6" From 4940c4e04757d2950ca1c2ba1586fbcde002b60c Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 19 Nov 2024 18:10:15 -0800 Subject: [PATCH 11/32] Improve deployment flow --- pulumi/__main__.py | 87 +++++++++++++++++++-------------- pulumi/scripts/activate_qsfs.sh | 4 -- pulumi/scripts/prep_vm.sh | 7 +++ 3 files changed, 58 insertions(+), 40 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index a3bde09..9aabf7d 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -90,8 +90,9 @@ description="A network", nodes=[VM_NODE], ip_range="10.1.0.0/16", - # TODO: for some reason the mycelium keys seem to get regenerated if we update the deployment (like replacing zdbs) - mycelium=True, + # With mycelium enabled, we can't redeploy the vm + # https://github.com/threefoldtech/pulumi-threefold/issues/552 + # mycelium=True, opts=pulumi.ResourceOptions(provider=provider), ) @@ -100,7 +101,7 @@ deployments = {} for node in nodes: - net_name = "" + net_name = NET_NAME vms = [] depends = [] if node == VM_NODE: @@ -116,7 +117,7 @@ cpu=CPU, memory=RAM, rootfs_size=ROOTFS, - mycelium=True, + # mycelium=True, planetary=True, public_ip6=True, env_vars={ @@ -159,11 +160,7 @@ def make_ssh_connection(vm): if SSH_CONNECTION == "mycelium": ssh_ip = vm["mycelium_ip"] elif SSH_CONNECTION == "ipv6": - # This allows us to call this from either inside an apply or outside - if isinstance(vm["computed_ip6"], str): - ssh_ip = vm["computed_ip6"].split("/")[0] - else: - ssh_ip = vm["computed_ip6"].apply(lambda ip6: ip6.split("/")[0]) + ssh_ip = 
vm["computed_ip6"].apply(lambda ip6: ip6.split("/")[0]) return pulumi_command.remote.ConnectionArgs( host=ssh_ip, @@ -224,19 +221,19 @@ def make_zstor_config(args): file.write(f'namespace = "{ns}"\n') file.write(f'password = "{args['zdb_pw']}"\n\n') - # We have to do this inside the function, because it's not possible to use - # an Output to specify the path to a FileAsset, and any value we return - # from this function becomes an Output - pulumi_command.remote.CopyToRemote( - "copy_zstor_config", - connection=make_ssh_connection(vm), - source=pulumi.FileAsset(path), - remote_path=ZSTOR_CONFIG_REMOTE, - triggers=[conn.host], - ) + # This way the current file is always in the same place and we get around + # the fact that it's not possible to return a path from this function and + # use it as a FileAsset, because you can't pass an Output to FileAsset + shutil.copy(path, ZSTOR_CONFIG_PATH.format("")) + + # TODO: check if the new file is actually different than the previous one + # and if not, delete it. 
I guess we could have some better logic to + # actually detect if the zdbs have changed, but we still need to do the bit + # below to copy the file to the VM whenever we replace the VM, even if the + # config file is the same -zstor_config = pulumi.Output.all( +pulumi.Output.all( deployments=[(d.vms_computed, d.zdbs_computed) for d in deployments.values()], zstor_key=zstor_key.hex, zdb_pw=zdb_pw.result, @@ -244,24 +241,38 @@ def make_zstor_config(args): vm = deployments[VM_NODE].vms_computed[0] conn = make_ssh_connection(vm) +depends = [] + +copy_zstor_config = pulumi_command.remote.CopyToRemote( + "copy_zstor_config", + connection=conn, + source=pulumi.FileAsset(ZSTOR_CONFIG_PATH.format("")), + remote_path=ZSTOR_CONFIG_REMOTE, + triggers=[conn.host], +) + if os.path.isfile("prometheus.yaml"): - pulumi_command.remote.CopyToRemote( - "copy_prometheus", - connection=conn, - source=pulumi.FileAsset("prometheus.yaml"), - remote_path="/etc/prometheus.yaml", - triggers=[conn.host], + depends.append( + pulumi_command.remote.CopyToRemote( + "copy_prometheus", + connection=conn, + source=pulumi.FileAsset("prometheus.yaml"), + remote_path="/etc/prometheus.yaml", + triggers=[conn.host], + ) ) # In case we want to test our own zstor binary, such as a prebuild if os.path.isfile("zstor"): - pulumi_command.remote.CopyToRemote( - "copy_zstor_binary", - connection=conn, - source=pulumi.FileAsset("zstor"), - remote_path="/usr/bin/zstor", - triggers=[conn.host], + depends.append( + pulumi_command.remote.CopyToRemote( + "copy_zstor_binary", + connection=conn, + source=pulumi.FileAsset("zstor"), + remote_path="/usr/bin/zstor", + triggers=[conn.host], + ) ) # We put the zinit files under /root to start, so that the services don't get @@ -283,19 +294,23 @@ def make_zstor_config(args): triggers=[conn.host], ) +depends.append(copy_scripts) + prep_vm = pulumi_command.remote.Command( "prep_vm", connection=conn, - create="bash /root/scripts/prep_vm.sh |& tee > /var/log/prep_vm.log", + 
create="bash /root/scripts/prep_vm.sh 2>&1 | tee > /var/log/prep_vm.log", triggers=[conn.host], - opts=pulumi.ResourceOptions(depends_on=[copy_scripts]), + opts=pulumi.ResourceOptions(depends_on=depends), ) +depends.extend([prep_vm, copy_zinit, copy_zstor_config]) pulumi_command.remote.Command( "activate_qsfs", connection=conn, - create="bash /root/scripts/activate_qsfs.sh |& tee > /var/log/activate_qsfs.log", - opts=pulumi.ResourceOptions(depends_on=[prep_vm, copy_zinit]), + create="bash /root/scripts/activate_qsfs.sh 2>&1 | tee > /var/log/activate_qsfs.log", + update="", + opts=pulumi.ResourceOptions(depends_on=depends), ) pulumi.export("mycelium_ip", vm.mycelium_ip) diff --git a/pulumi/scripts/activate_qsfs.sh b/pulumi/scripts/activate_qsfs.sh index 3cd1888..6ffda12 100644 --- a/pulumi/scripts/activate_qsfs.sh +++ b/pulumi/scripts/activate_qsfs.sh @@ -24,10 +24,6 @@ zinit monitor zdb zinit monitor zdbfs if [ -f /etc/prometheus.yaml ]; then - echo - echo Installing Prometheus - apt install -y prometheus - echo echo Copying Prometheus zinit service files cp /root/zinit/node-exporter.yaml /etc/zinit diff --git a/pulumi/scripts/prep_vm.sh b/pulumi/scripts/prep_vm.sh index bba3850..42fa3c5 100644 --- a/pulumi/scripts/prep_vm.sh +++ b/pulumi/scripts/prep_vm.sh @@ -26,3 +26,10 @@ fi echo echo Setting permissions for downloaded binaries chmod +x /usr/local/bin/* /bin/zstor + +if [ -f /etc/prometheus.yaml ]; then + echo + echo Installing Prometheus + apt update + apt install -y prometheus +fi From cf78f9dd6d6d391830d303d7834b86758462fcf9 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 19 Nov 2024 18:15:47 -0800 Subject: [PATCH 12/32] Add recover script --- pulumi/scripts/recover.sh | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 pulumi/scripts/recover.sh diff --git a/pulumi/scripts/recover.sh b/pulumi/scripts/recover.sh new file mode 100644 index 0000000..47b0c5d --- /dev/null +++ b/pulumi/scripts/recover.sh @@ -0,0 
+1,54 @@ +#!/bin/bash + +# Start zstor +cp /root/zinit/zstor.yaml /etc/zinit +zinit monitor zstor + +# Recover the (empty) temp namespace +# zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-temp/zdb-namespace +# zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-temp/i0 +# zstor -c /etc/zstor-default.toml retrieve --file /data/data/zdbfs-temp/d0 + +# Can't remember why I was trying to recover the temp namespace +# I think the hook ignores it anyway, so we can just start fresh + +apt update && apt install redis +redis-cli -p 9900 NSNEW zdbfs-temp +redis-cli -p 9900 NSSET zdbfs-temp password hello +redis-cli -p 9900 NSSET zdbfs-temp public 0 +redis-cli -p 9900 NSSET zdbfs-temp mode seq + +# Recover meta data index and (empty) working data file +zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-meta/zdb-namespace +i=0 +while true; do + result=$(zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-meta/i$i 2>&1) + if echo $result | grep -q error + then break + fi + i=$((i+1)) +done + +last_meta_index=$(ls /data/index/zdbfs-meta | tr -d i | sort -n | tail -n 1) +zstor -c /etc/zstor-default.toml retrieve --file /data/data/zdbfs-meta/d$last_meta_index + +# Recover data index and (empty) working file +zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-data/zdb-namespace +i=0 +while true; do + result=$(zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-data/i$i 2>&1) + if echo $result | grep -q error + then break + fi + i=$((i+1)) +done + +last_data_index=$(ls /data/index/zdbfs-data | tr -d i | sort -n | tail -n 1) +zstor -c /etc/zstor-default.toml retrieve --file /data/data/zdbfs-data/d$last_data_index + + +# Start zdb and zdbfs + +cp /root/zinit/* /etc/zinit +zinit monitor zdb +zinit monitor zdbfs From f3a7824930a8cba6cbdca984fe5d20c19b7edcbd Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Wed, 20 Nov 2024 18:40:45 -0800 Subject: [PATCH 13/32] Fix recover script --- 
pulumi/scripts/recover.sh | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/pulumi/scripts/recover.sh b/pulumi/scripts/recover.sh index 47b0c5d..3824a56 100644 --- a/pulumi/scripts/recover.sh +++ b/pulumi/scripts/recover.sh @@ -1,24 +1,31 @@ #!/bin/bash -# Start zstor -cp /root/zinit/zstor.yaml /etc/zinit -zinit monitor zstor +# This script is for recovering an existing QSFS onto a new VM + +echo +echo "Creating QSFS mount point at /mnt/qsfs..." +mkdir -p /mnt/qsfs -# Recover the (empty) temp namespace -# zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-temp/zdb-namespace -# zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-temp/i0 -# zstor -c /etc/zstor-default.toml retrieve --file /data/data/zdbfs-temp/d0 +echo +echo "Starting zstor and zdb services..." +cp /root/zinit/* /etc/zinit +zinit monitor zstor +zinit monitor zdb -# Can't remember why I was trying to recover the temp namespace -# I think the hook ignores it anyway, so we can just start fresh +# The temp namespace is not backed up, so we just create it manually +echo +echo "Installing redis-cli..." +apt update && apt install -y redis -apt update && apt install redis +echo +echo "Setting up temp namespace..." redis-cli -p 9900 NSNEW zdbfs-temp redis-cli -p 9900 NSSET zdbfs-temp password hello redis-cli -p 9900 NSSET zdbfs-temp public 0 redis-cli -p 9900 NSSET zdbfs-temp mode seq -# Recover meta data index and (empty) working data file +echo +echo "Recovering metadata indexes..." zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-meta/zdb-namespace i=0 while true; do @@ -29,10 +36,13 @@ while true; do i=$((i+1)) done +echo +echo "Retrieving latest metadata data file..." 
last_meta_index=$(ls /data/index/zdbfs-meta | tr -d i | sort -n | tail -n 1) zstor -c /etc/zstor-default.toml retrieve --file /data/data/zdbfs-meta/d$last_meta_index -# Recover data index and (empty) working file +echo +echo "Recovering data indexes..." zstor -c /etc/zstor-default.toml retrieve --file /data/index/zdbfs-data/zdb-namespace i=0 while true; do @@ -43,12 +53,13 @@ while true; do i=$((i+1)) done +echo +echo "Retrieving latest data data file..." last_data_index=$(ls /data/index/zdbfs-data | tr -d i | sort -n | tail -n 1) zstor -c /etc/zstor-default.toml retrieve --file /data/data/zdbfs-data/d$last_data_index -# Start zdb and zdbfs - -cp /root/zinit/* /etc/zinit -zinit monitor zdb +# Start zdbfs +echo +echo "Starting ZDBFS service..." zinit monitor zdbfs From 848e8394afcc7545a1d282acb5b62b5e075028b1 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Wed, 20 Nov 2024 18:42:06 -0800 Subject: [PATCH 14/32] Improving test --- pulumi/tests/rebuild/post_deploy.sh | 1 - pulumi/tests/rebuild/run.sh | 8 +++---- .../{scripts => test-scripts}/deploy.sh | 0 .../{scripts => test-scripts}/destroy.sh | 0 .../rebuild/test-scripts/rebuild_original.sh | 22 +++++++++++++++++++ .../{scripts => test-scripts}/redeploy.sh | 4 ++-- .../{scripts => test-scripts}/write_data.sh | 18 ++++++++++++--- pulumi/tests/rebuild/zinit | 1 - pulumi/tests/rebuild/zinit/node-exporter.yaml | 1 + pulumi/tests/rebuild/zinit/prometheus.yaml | 1 + pulumi/tests/rebuild/zinit/zdb.yaml | 10 +++++++++ pulumi/tests/rebuild/zinit/zdbfs.yaml | 2 ++ pulumi/tests/rebuild/zinit/zstor.yaml | 6 +++++ 13 files changed, 63 insertions(+), 11 deletions(-) delete mode 120000 pulumi/tests/rebuild/post_deploy.sh rename pulumi/tests/rebuild/{scripts => test-scripts}/deploy.sh (100%) rename pulumi/tests/rebuild/{scripts => test-scripts}/destroy.sh (100%) create mode 100644 pulumi/tests/rebuild/test-scripts/rebuild_original.sh rename pulumi/tests/rebuild/{scripts => test-scripts}/redeploy.sh (78%) rename 
pulumi/tests/rebuild/{scripts => test-scripts}/write_data.sh (63%) delete mode 120000 pulumi/tests/rebuild/zinit create mode 100644 pulumi/tests/rebuild/zinit/node-exporter.yaml create mode 100644 pulumi/tests/rebuild/zinit/prometheus.yaml create mode 100644 pulumi/tests/rebuild/zinit/zdb.yaml create mode 100644 pulumi/tests/rebuild/zinit/zdbfs.yaml create mode 100644 pulumi/tests/rebuild/zinit/zstor.yaml diff --git a/pulumi/tests/rebuild/post_deploy.sh b/pulumi/tests/rebuild/post_deploy.sh deleted file mode 120000 index 149f4fd..0000000 --- a/pulumi/tests/rebuild/post_deploy.sh +++ /dev/null @@ -1 +0,0 @@ -../../post_deploy.sh \ No newline at end of file diff --git a/pulumi/tests/rebuild/run.sh b/pulumi/tests/rebuild/run.sh index ac492e7..7ae1d91 100644 --- a/pulumi/tests/rebuild/run.sh +++ b/pulumi/tests/rebuild/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -scripts/deploy.sh -scripts/write_data.sh -scripts/redeploy.sh -scripts/destroy.sh +test-scripts/deploy.sh +test-scripts/write_data.sh +test-scripts/redeploy.sh +test-scripts/destroy.sh diff --git a/pulumi/tests/rebuild/scripts/deploy.sh b/pulumi/tests/rebuild/test-scripts/deploy.sh similarity index 100% rename from pulumi/tests/rebuild/scripts/deploy.sh rename to pulumi/tests/rebuild/test-scripts/deploy.sh diff --git a/pulumi/tests/rebuild/scripts/destroy.sh b/pulumi/tests/rebuild/test-scripts/destroy.sh similarity index 100% rename from pulumi/tests/rebuild/scripts/destroy.sh rename to pulumi/tests/rebuild/test-scripts/destroy.sh diff --git a/pulumi/tests/rebuild/test-scripts/rebuild_original.sh b/pulumi/tests/rebuild/test-scripts/rebuild_original.sh new file mode 100644 index 0000000..5ac824e --- /dev/null +++ b/pulumi/tests/rebuild/test-scripts/rebuild_original.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# We need this to run non interactively. 
Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +echo -e "\n===== Removing local data files and reconstructin from backends =====" + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +ssh -t root@$ipv6 ' + rm /data/data/zdbfs-data/* + for i in {1..10}; do + md5sum file$i.dat + done +' > md5s_new + +diff md5_original md5_new + +if cmp -s md5_original md5_new; then + echo -e "\n===== Hashes match after rebuild, success =====" +else + echo -e "\n===== Hashes differ after rebuild, failure ====" diff --git a/pulumi/tests/rebuild/scripts/redeploy.sh b/pulumi/tests/rebuild/test-scripts/redeploy.sh similarity index 78% rename from pulumi/tests/rebuild/scripts/redeploy.sh rename to pulumi/tests/rebuild/test-scripts/redeploy.sh index 1a56b46..42cb55e 100755 --- a/pulumi/tests/rebuild/scripts/redeploy.sh +++ b/pulumi/tests/rebuild/test-scripts/redeploy.sh @@ -14,6 +14,6 @@ ssh -t root@$ipv6 ' pkill zstor -SIGUSR1 # Wait some time to let the rebuild process start. This should be enough? sleep 10 - # Output should show us if any data has been written to the new backends yet - zstor -c /etc/zstor-default.toml status ' + +# TODO: Need to figure out a way to check whether rebuilding has succeeded. diff --git a/pulumi/tests/rebuild/scripts/write_data.sh b/pulumi/tests/rebuild/test-scripts/write_data.sh similarity index 63% rename from pulumi/tests/rebuild/scripts/write_data.sh rename to pulumi/tests/rebuild/test-scripts/write_data.sh index 9cad9c9..775b2fd 100755 --- a/pulumi/tests/rebuild/scripts/write_data.sh +++ b/pulumi/tests/rebuild/test-scripts/write_data.sh @@ -12,13 +12,17 @@ ssh -t root@$ipv6 ' echo "Creating file$i.dat..." 
dd if=/dev/urandom of=file$i.dat bs=1M count=100 done +' +ssh -t root@$ipv6 ' echo -e "\n===== Calculating MD5 checksums of source files =====" # Calculate and print MD5 sum for each file for i in {1..10}; do md5sum file$i.dat done +' > md5s_original +ssh -t root@$ipv6 ' echo -e "\n===== Installing pv tool for transfer monitoring =====" apt update &> /dev/null && apt install -y pv &> /dev/null @@ -29,7 +33,15 @@ ssh -t root@$ipv6 ' pv -s 100m "file$i.dat" > "/mnt/qsfs/file$i.dat" done - echo -e "\n===== Checking zstor backend status =====" - # Print zstor status to give an idea of whether data was uploaded to backends - zstor -c /etc/zstor-default.toml status + echo -e "\n===== Waiting for all data files to upload =====" + # Here we are taking advantage of the fact that there is a 10 second delay + # before the last data file gets rotated, as specified in zinit/zdb.yaml. + # After the rotation, there will be a new file with a higher index number + # that is not uploaded to zstor, but we do not care about that file since it + # will have no data + for file in /data/data/zdbfs-data/*; do + while ! 
zstor -c /etc/zstor-default.toml check "$file"; do + sleep 2 + done + done ' diff --git a/pulumi/tests/rebuild/zinit b/pulumi/tests/rebuild/zinit deleted file mode 120000 index ed52882..0000000 --- a/pulumi/tests/rebuild/zinit +++ /dev/null @@ -1 +0,0 @@ -../../zinit/ \ No newline at end of file diff --git a/pulumi/tests/rebuild/zinit/node-exporter.yaml b/pulumi/tests/rebuild/zinit/node-exporter.yaml new file mode 100644 index 0000000..1a466e1 --- /dev/null +++ b/pulumi/tests/rebuild/zinit/node-exporter.yaml @@ -0,0 +1 @@ +exec: prometheus-node-exporter diff --git a/pulumi/tests/rebuild/zinit/prometheus.yaml b/pulumi/tests/rebuild/zinit/prometheus.yaml new file mode 100644 index 0000000..3e07ecc --- /dev/null +++ b/pulumi/tests/rebuild/zinit/prometheus.yaml @@ -0,0 +1 @@ +exec: prometheus --config.file=/etc/prometheus.yaml diff --git a/pulumi/tests/rebuild/zinit/zdb.yaml b/pulumi/tests/rebuild/zinit/zdb.yaml new file mode 100644 index 0000000..9e56d12 --- /dev/null +++ b/pulumi/tests/rebuild/zinit/zdb.yaml @@ -0,0 +1,10 @@ +exec: | + /usr/local/bin/zdb \ + --index /data/index \ + --data /data/data \ + --logfile /var/log/zdb.log \ + --datasize 67108864 \ + --hook /usr/local/bin/zdb-hook.sh \ + --rotate 10 +shutdown_timeout: 60 +after: [zstor] diff --git a/pulumi/tests/rebuild/zinit/zdbfs.yaml b/pulumi/tests/rebuild/zinit/zdbfs.yaml new file mode 100644 index 0000000..c999d7d --- /dev/null +++ b/pulumi/tests/rebuild/zinit/zdbfs.yaml @@ -0,0 +1,2 @@ +exec: /usr/local/bin/zdbfs /mnt/qsfs -o autons +after: [zdb] diff --git a/pulumi/tests/rebuild/zinit/zstor.yaml b/pulumi/tests/rebuild/zinit/zstor.yaml new file mode 100644 index 0000000..0f1a98f --- /dev/null +++ b/pulumi/tests/rebuild/zinit/zstor.yaml @@ -0,0 +1,6 @@ +exec: | + /bin/zstor \ + -c /etc/zstor-default.toml \ + --log_file /var/log/zstor.log \ + monitor +shutdown_timeout: 300 From f518ef1d2f54e9463f48332f7a4605b4778d9e5b Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Wed, 20 Nov 2024 18:49:59 -0800 
Subject: [PATCH 15/32] Update README --- pulumi/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pulumi/README.md b/pulumi/README.md index 94b265f..c709c22 100644 --- a/pulumi/README.md +++ b/pulumi/README.md @@ -55,3 +55,17 @@ If you want to destroy the deployment, bring it down like this: ``` pulumi down ``` + +## Recover to new VM + +If you need to replace the frontend VM for any reason, such as a node outage, follow these steps. Any data that has been uploaded to the backends can be recovered into the new VM. Any data that was not yet uploaded to the backends will be lost. + +1. Update the `vars.py` file and set `VM_NODE` to the new node id +2. Destroy the old VM and deploy the new VM by running `pulumi up` +3. SSH to the new VM and run the recovery script: + +``` +bash /root/scripts/recover.sh +``` + +If all went well, your files should appear under the mount point, `/mnt/qsfs`. From 402ab898a3ae81c46a526c01bd952291ec8de779 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 21 Nov 2024 12:32:49 -0800 Subject: [PATCH 16/32] Change config numbering and clean unused code --- pulumi/__main__.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 9aabf7d..c6f67d7 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -24,13 +24,9 @@ except ModuleNotFoundError: exit("vars.py not found. Exiting.") -# How many times we try pinging the VM after deployment before proceeding with -# SSH based commands. Total time to wait will be this times 10 seconds -PING_RETRIES = 10 - # Same for the base zstor config. Exit if the user didn't provide this ZSTOR_CONFIG_BASE = "zstor_config.base.toml" -ZSTOR_CONFIG_PATH = "zstor_config{}.toml" +ZSTOR_CONFIG_PATH = "zstor_config.toml" # This path is hard coded in the Zdb hook script ZSTOR_CONFIG_REMOTE = "/etc/zstor-default.toml" @@ -174,7 +170,7 @@ def make_zstor_config(args): # file. 
Here we always choose a new local path and leave the old files # around just in case i = 1 - while os.path.exists(path := ZSTOR_CONFIG_PATH.format(i)): + while os.path.exists(path := ZSTOR_CONFIG_PATH + "." + str(i)): i += 1 shutil.copy(ZSTOR_CONFIG_BASE, path) @@ -224,7 +220,7 @@ def make_zstor_config(args): # This way the current file is always in the same place and we get around # the fact that it's not possible to return a path from this function and # use it as a FileAsset, because you can't pass an Output to FileAsset - shutil.copy(path, ZSTOR_CONFIG_PATH.format("")) + shutil.copy(path, ZSTOR_CONFIG_PATH) # TODO: check if the new file is actually different than the previous one # and if not, delete it. I guess we could have some better logic to @@ -246,7 +242,7 @@ def make_zstor_config(args): copy_zstor_config = pulumi_command.remote.CopyToRemote( "copy_zstor_config", connection=conn, - source=pulumi.FileAsset(ZSTOR_CONFIG_PATH.format("")), + source=pulumi.FileAsset(ZSTOR_CONFIG_PATH), remote_path=ZSTOR_CONFIG_REMOTE, triggers=[conn.host], ) From 3572bafe04ea790dd61217fa8734dc7821f52153 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 21 Nov 2024 12:41:01 -0800 Subject: [PATCH 17/32] Remove dead link --- pulumi/tests/rebuild/util.py | 1 - 1 file changed, 1 deletion(-) delete mode 120000 pulumi/tests/rebuild/util.py diff --git a/pulumi/tests/rebuild/util.py b/pulumi/tests/rebuild/util.py deleted file mode 120000 index 2174c92..0000000 --- a/pulumi/tests/rebuild/util.py +++ /dev/null @@ -1 +0,0 @@ -../../util.py \ No newline at end of file From 6c0743b4238185143dee1b39579add9b557806dd Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 21 Nov 2024 19:16:27 -0800 Subject: [PATCH 18/32] Reorg test scripts --- pulumi/tests/test-scripts-local/README.md | 1 + pulumi/tests/test-scripts-local/deploy.sh | 9 +++++++ pulumi/tests/test-scripts-local/destroy.sh | 7 +++++ .../test-scripts-local/rebuild_original.sh | 11 ++++++++ 
pulumi/tests/test-scripts-local/redeploy.sh | 19 ++++++++++++++ .../upload_remote_scripts.sh | 15 +++++++++++ pulumi/tests/test-scripts-remote/README.md | 1 + .../tests/test-scripts-remote/check_hashes.sh | 16 ++++++++++++ .../tests/test-scripts-remote/copy_to_qsfs.sh | 26 +++++++++++++++++++ .../tests/test-scripts-remote/write_data.sh | 22 ++++++++++++++++ 10 files changed, 127 insertions(+) create mode 100644 pulumi/tests/test-scripts-local/README.md create mode 100755 pulumi/tests/test-scripts-local/deploy.sh create mode 100755 pulumi/tests/test-scripts-local/destroy.sh create mode 100755 pulumi/tests/test-scripts-local/rebuild_original.sh create mode 100755 pulumi/tests/test-scripts-local/redeploy.sh create mode 100755 pulumi/tests/test-scripts-local/upload_remote_scripts.sh create mode 100644 pulumi/tests/test-scripts-remote/README.md create mode 100755 pulumi/tests/test-scripts-remote/check_hashes.sh create mode 100755 pulumi/tests/test-scripts-remote/copy_to_qsfs.sh create mode 100755 pulumi/tests/test-scripts-remote/write_data.sh diff --git a/pulumi/tests/test-scripts-local/README.md b/pulumi/tests/test-scripts-local/README.md new file mode 100644 index 0000000..e305e1e --- /dev/null +++ b/pulumi/tests/test-scripts-local/README.md @@ -0,0 +1 @@ +This folder contains scripts that run locally on the machine orchestrating the test. They are responsible for creating, updating, and destroying the deployment. diff --git a/pulumi/tests/test-scripts-local/deploy.sh b/pulumi/tests/test-scripts-local/deploy.sh new file mode 100755 index 0000000..0bd645d --- /dev/null +++ b/pulumi/tests/test-scripts-local/deploy.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# We need this to run non interactively. 
Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +pulumi stack init test + +cp vars.original.py vars.py +pulumi up -s test -y --non-interactive diff --git a/pulumi/tests/test-scripts-local/destroy.sh b/pulumi/tests/test-scripts-local/destroy.sh new file mode 100755 index 0000000..92d90e7 --- /dev/null +++ b/pulumi/tests/test-scripts-local/destroy.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +pulumi down -s test -y --non-interactive +pulumi stack rm -yf test diff --git a/pulumi/tests/test-scripts-local/rebuild_original.sh b/pulumi/tests/test-scripts-local/rebuild_original.sh new file mode 100755 index 0000000..d90a422 --- /dev/null +++ b/pulumi/tests/test-scripts-local/rebuild_original.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +echo -e "\n===== Removing local data files and reconstructin from backends =====" + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +ssh -t root@$ipv6 'rm /data/data/zdbfs-data/*' +ssh -t root@$ipv6 /root/test-scripts/check_hashes.sh diff --git a/pulumi/tests/test-scripts-local/redeploy.sh b/pulumi/tests/test-scripts-local/redeploy.sh new file mode 100755 index 0000000..42cb55e --- /dev/null +++ b/pulumi/tests/test-scripts-local/redeploy.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +pulumi stack init test + +cp vars.new.py vars.py +pulumi up -s test -y --non-interactive + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +ssh -t root@$ipv6 ' + pkill zstor -SIGUSR1 + # Wait some time to let the rebuild process start. This should be enough? 
+ sleep 10 +' + +# TODO: Need to figure out a way to check whether rebuilding has succeeded. diff --git a/pulumi/tests/test-scripts-local/upload_remote_scripts.sh b/pulumi/tests/test-scripts-local/upload_remote_scripts.sh new file mode 100755 index 0000000..d479efe --- /dev/null +++ b/pulumi/tests/test-scripts-local/upload_remote_scripts.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +# Use first argument if provided, otherwise default to 'test' +STACK_NAME=${1:-test} +ipv6=$(pulumi stack -s $STACK_NAME | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +# Get directory of script file. This way the path to upload is always correct +# regardless of where the script is run from +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +echo -e "\n===== Copying remote test scripts to the VM ====" +scp -o StrictHostKeyChecking=accept-new -r "$SCRIPT_DIR"/../test-scripts-remote/ "root@[$ipv6]:/root/test-scripts" diff --git a/pulumi/tests/test-scripts-remote/README.md b/pulumi/tests/test-scripts-remote/README.md new file mode 100644 index 0000000..530fd77 --- /dev/null +++ b/pulumi/tests/test-scripts-remote/README.md @@ -0,0 +1 @@ +This folder contains scripts that run on the remote machine during testing. They are basically about creating some data files and ensuring their integrity after being retrieved. diff --git a/pulumi/tests/test-scripts-remote/check_hashes.sh b/pulumi/tests/test-scripts-remote/check_hashes.sh new file mode 100755 index 0000000..2d0ef86 --- /dev/null +++ b/pulumi/tests/test-scripts-remote/check_hashes.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +echo -e "\n===== Calculating MD5 hashes of stored files =====" +# Calculate an MD5 hash for each file and write to new hash file. 
Overwrite +# the new hashes file since we might run this multiple times +rm -f /root/data/md5s_new +for i in {1..10}; do + md5sum /mnt/qsfs/file$i.dat | cut -d " " -f 1 >> /root/data/md5s_new +done + +echo -e "\n===== Comparing hashes ====" +if cmp -s /root/data/md5s_original /root/data/md5s_new; then + echo -e "\n===== Hashes match, success =====" +else + echo -e "\n===== Hashes differ, failure ====" +fi diff --git a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh new file mode 100755 index 0000000..ea06e82 --- /dev/null +++ b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# This script copies the data files into the QSFS and waits for uploading to +# complete. + +echo -e "\n===== Installing pv tool for transfer monitoring =====" +apt update &> /dev/null && apt install -y pv &> /dev/null + +echo -e "\n===== Copying files to QSFS mount with progress monitoring =====" +# Copy files to the qsfs mount and check speed +for i in {1..10}; do + echo "Copying file$i.dat..." + pv -s 100m "/root/data/file$i.dat" > "/mnt/qsfs/file$i.dat" +done + +echo -e "\n===== Waiting for all data files to upload =====" +# Here we are taking advantage of the fact that there is a 10 second delay +# before the last data file gets rotated, as specified in zinit/zdb.yaml. +# After the rotation, there will be a new file with a higher index number +# that is not uploaded to zstor, but we do not care about that file since it +# will have no data +for file in /data/data/zdbfs-data/*; do + while ! 
zstor -c /etc/zstor-default.toml check --file "$file" &> /dev/null; do + sleep 2 + done +done diff --git a/pulumi/tests/test-scripts-remote/write_data.sh b/pulumi/tests/test-scripts-remote/write_data.sh new file mode 100755 index 0000000..d6c0cd6 --- /dev/null +++ b/pulumi/tests/test-scripts-remote/write_data.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# This script generates random data files in regular storage on the VM then +# takes the checksum of each one and stores the sums in a file. By using +# regular storage, we get a baseline idea about write performance and also +# ensure that our later data integrity checks compare to something totally +# independent of the QSFS machinery. + +echo "===== Creating 10 test files with 100MB random data each =====" +# Create 10 files with 100mb random data +mkdir -p /root/data +for i in {1..10}; do + echo "Creating file$i.dat..." + dd if=/dev/urandom of=/root/data/file$i.dat bs=1M count=100 +done + +echo -e "\n===== Calculating MD5 checksums of source files =====" +# Calculate and MD5 sum for each file and write to file +touch /root/data/md5s_original +for i in {1..10}; do + md5sum /root/data/file$i.dat | cut -d " " -f 1 >> /root/data/md5s_original +done From 2d1cddf84f3f4707470bca2eb1d07a6c0dfb9c79 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 26 Nov 2024 22:02:23 -0800 Subject: [PATCH 19/32] Update vars.example.py --- pulumi/vars.example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pulumi/vars.example.py b/pulumi/vars.example.py index bb759c3..7aaeaa0 100644 --- a/pulumi/vars.example.py +++ b/pulumi/vars.example.py @@ -14,7 +14,7 @@ META_NODES = [1, 3, 5, 8] DATA_NODES = [1, 3, 5, 8] -# Size of each data backend Zdb +# Size of each data backend Zdb in GB DATA_SIZE = 1 # Network used to connect to the backend zdbs From dfc2679dc306ae6ea42e9d3cdfff1977d39b6b10 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 12 Dec 2024 17:06:37 -0800 Subject: [PATCH 20/32] Update triggers for config upload 
--- pulumi/__main__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index c6f67d7..3f2cb78 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -244,7 +244,11 @@ def make_zstor_config(args): connection=conn, source=pulumi.FileAsset(ZSTOR_CONFIG_PATH), remote_path=ZSTOR_CONFIG_REMOTE, - triggers=[conn.host], + # triggers=[conn.host], + # TODO: need to verify that this works in both cases where we need to + # upload the config again: when the vm is changed and when any zdb is + # changed + triggers=list(deployments.values()), ) From 4517cc1032a6a54b3aea4c6a70ebea949dabe142 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Thu, 12 Dec 2024 18:08:24 -0800 Subject: [PATCH 21/32] Improve tests --- pulumi/__main__.py | 8 +++- pulumi/tests/rebuild/README.md | 19 ++++++++ pulumi/tests/rebuild/run.sh | 12 +++-- pulumi/tests/rebuild/scripts | 1 + pulumi/tests/rebuild/test-scripts/deploy.sh | 9 ---- pulumi/tests/rebuild/test-scripts/destroy.sh | 7 --- .../rebuild/test-scripts/rebuild_original.sh | 22 --------- pulumi/tests/rebuild/test-scripts/redeploy.sh | 19 -------- .../tests/rebuild/test-scripts/write_data.sh | 47 ------------------- pulumi/tests/rebuild/vars.new.py | 8 +++- pulumi/tests/rebuild/vars.original.py | 8 +++- pulumi/tests/rebuild/zinit | 1 + pulumi/tests/rebuild/zinit/node-exporter.yaml | 1 - pulumi/tests/rebuild/zinit/prometheus.yaml | 1 - pulumi/tests/rebuild/zinit/zdb.yaml | 10 ---- pulumi/tests/rebuild/zinit/zdbfs.yaml | 2 - pulumi/tests/rebuild/zinit/zstor.yaml | 6 --- .../tests/test-scripts-local/create_data.sh | 12 +++++ .../test-scripts-local/rebuild_original.sh | 2 +- .../tests/test-scripts-remote/check_hashes.sh | 2 +- .../tests/test-scripts-remote/copy_to_qsfs.sh | 1 + .../tests/test-scripts-remote/write_data.sh | 4 +- 22 files changed, 65 insertions(+), 137 deletions(-) create mode 100644 pulumi/tests/rebuild/README.md mode change 100644 => 100755 
pulumi/tests/rebuild/run.sh create mode 120000 pulumi/tests/rebuild/scripts delete mode 100755 pulumi/tests/rebuild/test-scripts/deploy.sh delete mode 100755 pulumi/tests/rebuild/test-scripts/destroy.sh delete mode 100644 pulumi/tests/rebuild/test-scripts/rebuild_original.sh delete mode 100755 pulumi/tests/rebuild/test-scripts/redeploy.sh delete mode 100755 pulumi/tests/rebuild/test-scripts/write_data.sh create mode 120000 pulumi/tests/rebuild/zinit delete mode 100644 pulumi/tests/rebuild/zinit/node-exporter.yaml delete mode 100644 pulumi/tests/rebuild/zinit/prometheus.yaml delete mode 100644 pulumi/tests/rebuild/zinit/zdb.yaml delete mode 100644 pulumi/tests/rebuild/zinit/zdbfs.yaml delete mode 100644 pulumi/tests/rebuild/zinit/zstor.yaml create mode 100755 pulumi/tests/test-scripts-local/create_data.sh diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 3f2cb78..bc36f16 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -2,6 +2,7 @@ import secrets import shutil import textwrap +from pathlib import Path import pulumi import pulumi_random @@ -229,7 +230,7 @@ def make_zstor_config(args): # config file is the same -pulumi.Output.all( +zstor_config_output = pulumi.Output.all( deployments=[(d.vms_computed, d.zdbs_computed) for d in deployments.values()], zstor_key=zstor_key.hex, zdb_pw=zdb_pw.result, @@ -239,6 +240,10 @@ def make_zstor_config(args): conn = make_ssh_connection(vm) depends = [] +# CopyToRemote requires that there's any file in the path after the initial run +# of the code or it crashes. So we just put a dummy file here if needed. 
The +# config will get written and pulled into the VM anyway +Path(ZSTOR_CONFIG_PATH).touch() copy_zstor_config = pulumi_command.remote.CopyToRemote( "copy_zstor_config", connection=conn, @@ -249,6 +254,7 @@ def make_zstor_config(args): # upload the config again: when the vm is changed and when any zdb is # changed triggers=list(deployments.values()), + opts=pulumi.ResourceOptions(depends_on=[zstor_config_output]), ) diff --git a/pulumi/tests/rebuild/README.md b/pulumi/tests/rebuild/README.md new file mode 100644 index 0000000..f810a4b --- /dev/null +++ b/pulumi/tests/rebuild/README.md @@ -0,0 +1,19 @@ +This is meant to be a fully automated test of the rebuild/repair system in Zstor. + +It does these steps: + +1. Deploy a QSFS using an original configuration +2. Write some data into the QSFS (random files) +3. Replace one of the original Zdbs with a new one on a different node (do this for both data and metadata) +4. Upload a new config file to the frontend VM and try to hot reload the config using SIGUSR1 +5. Check the `status` output from zstor to see if some data has been written to the new backends + +Perhaps a better test would be to force zstor to rebuild the data from the new backend, by blocking access to enough of the original backends that using the new backend is necessary to fulfill the required shard count to rebuild. + +To use it, just: + +``` +./run.sh +``` + +This runs a set of scripts in the correct order. You can also run the scripts individually and inspect the state step by step. 
diff --git a/pulumi/tests/rebuild/run.sh b/pulumi/tests/rebuild/run.sh old mode 100644 new mode 100755 index 7ae1d91..8f4875d --- a/pulumi/tests/rebuild/run.sh +++ b/pulumi/tests/rebuild/run.sh @@ -1,6 +1,10 @@ #!/bin/bash -test-scripts/deploy.sh -test-scripts/write_data.sh -test-scripts/redeploy.sh -test-scripts/destroy.sh +source ../../venv/bin/activate + +../test-scripts-local/deploy.sh +../test-scripts-local/upload_remote_scripts.sh +../test-scripts-local/create_data.sh +../test-scripts-local/redeploy.sh +../test-scripts-local/rebuild_original.sh +../test-scripts-local/destroy.sh diff --git a/pulumi/tests/rebuild/scripts b/pulumi/tests/rebuild/scripts new file mode 120000 index 0000000..11aee1d --- /dev/null +++ b/pulumi/tests/rebuild/scripts @@ -0,0 +1 @@ +../../scripts/ \ No newline at end of file diff --git a/pulumi/tests/rebuild/test-scripts/deploy.sh b/pulumi/tests/rebuild/test-scripts/deploy.sh deleted file mode 100755 index 0bd645d..0000000 --- a/pulumi/tests/rebuild/test-scripts/deploy.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# We need this to run non interactively. Otherwise we'll be prompted for it -export PULUMI_CONFIG_PASSPHRASE="" - -pulumi stack init test - -cp vars.original.py vars.py -pulumi up -s test -y --non-interactive diff --git a/pulumi/tests/rebuild/test-scripts/destroy.sh b/pulumi/tests/rebuild/test-scripts/destroy.sh deleted file mode 100755 index 92d90e7..0000000 --- a/pulumi/tests/rebuild/test-scripts/destroy.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# We need this to run non interactively. 
Otherwise we'll be prompted for it -export PULUMI_CONFIG_PASSPHRASE="" - -pulumi down -s test -y --non-interactive -pulumi stack rm -yf test diff --git a/pulumi/tests/rebuild/test-scripts/rebuild_original.sh b/pulumi/tests/rebuild/test-scripts/rebuild_original.sh deleted file mode 100644 index 5ac824e..0000000 --- a/pulumi/tests/rebuild/test-scripts/rebuild_original.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# We need this to run non interactively. Otherwise we'll be prompted for it -export PULUMI_CONFIG_PASSPHRASE="" - -echo -e "\n===== Removing local data files and reconstructin from backends =====" - -ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) - -ssh -t root@$ipv6 ' - rm /data/data/zdbfs-data/* - for i in {1..10}; do - md5sum file$i.dat - done -' > md5s_new - -diff md5_original md5_new - -if cmp -s md5_original md5_new; then - echo -e "\n===== Hashes match after rebuild, success =====" -else - echo -e "\n===== Hashes differ after rebuild, failure ====" diff --git a/pulumi/tests/rebuild/test-scripts/redeploy.sh b/pulumi/tests/rebuild/test-scripts/redeploy.sh deleted file mode 100755 index 42cb55e..0000000 --- a/pulumi/tests/rebuild/test-scripts/redeploy.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# We need this to run non interactively. Otherwise we'll be prompted for it -export PULUMI_CONFIG_PASSPHRASE="" - -pulumi stack init test - -cp vars.new.py vars.py -pulumi up -s test -y --non-interactive - -ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) - -ssh -t root@$ipv6 ' - pkill zstor -SIGUSR1 - # Wait some time to let the rebuild process start. This should be enough? - sleep 10 -' - -# TODO: Need to figure out a way to check whether rebuilding has succeeded. 
diff --git a/pulumi/tests/rebuild/test-scripts/write_data.sh b/pulumi/tests/rebuild/test-scripts/write_data.sh deleted file mode 100755 index 775b2fd..0000000 --- a/pulumi/tests/rebuild/test-scripts/write_data.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# We need this to run non interactively. Otherwise we'll be prompted for it -export PULUMI_CONFIG_PASSPHRASE="" - -ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) - -ssh -t root@$ipv6 ' - echo "===== Creating 10 test files with 100MB random data each =====" - # Create 10 files with 100mb random data - for i in {1..10}; do - echo "Creating file$i.dat..." - dd if=/dev/urandom of=file$i.dat bs=1M count=100 - done -' - -ssh -t root@$ipv6 ' - echo -e "\n===== Calculating MD5 checksums of source files =====" - # Calculate and print MD5 sum for each file - for i in {1..10}; do - md5sum file$i.dat - done -' > md5s_original - -ssh -t root@$ipv6 ' - echo -e "\n===== Installing pv tool for transfer monitoring =====" - apt update &> /dev/null && apt install -y pv &> /dev/null - - echo -e "\n===== Copying files to QSFS mount with progress monitoring =====" - # Copy files to the qsfs mount and check speed - for i in {1..10}; do - echo "Copying file$i.dat..." - pv -s 100m "file$i.dat" > "/mnt/qsfs/file$i.dat" - done - - echo -e "\n===== Waiting for all data files to upload =====" - # Here we are taking advantage of the fact that there is a 10 second delay - # before the last data file gets rotated, as specified in zinit/zdb.yaml. - # After the rotation, there will be a new file with a higher index number - # that is not uploaded to zstor, but we do not care about that file since it - # will have no data - for file in /data/data/zdbfs-data/*; do - while ! 
zstor -c /etc/zstor-default.toml check "$file"; do - sleep 2 - done - done -' diff --git a/pulumi/tests/rebuild/vars.new.py b/pulumi/tests/rebuild/vars.new.py index ba07c79..3c7645b 100644 --- a/pulumi/tests/rebuild/vars.new.py +++ b/pulumi/tests/rebuild/vars.new.py @@ -2,8 +2,8 @@ MNEMONIC = "" NETWORK = "test" -# Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub -SSH_KEY = "" + +SSH_KEY_PATH = "~/.ssh/id_rsa" # Node to deploy VM on. Can overlap with Zdb nodes or not, doesn't matter VM_NODE = 5 @@ -18,3 +18,7 @@ # Network used to connect to the backend zdbs # ZDB_CONNECTION = "mycelium" ZDB_CONNECTION = "ipv6" + +# Network used for SSH connection +# SSH_CONNECTION = "mycelium" +SSH_CONNECTION = "ipv6" diff --git a/pulumi/tests/rebuild/vars.original.py b/pulumi/tests/rebuild/vars.original.py index f0958d2..cf78401 100644 --- a/pulumi/tests/rebuild/vars.original.py +++ b/pulumi/tests/rebuild/vars.original.py @@ -2,8 +2,8 @@ MNEMONIC = "" NETWORK = "test" -# Public SSH key. If empty, we'll attempt to read it from ~/.ssh/*.pub -SSH_KEY = "" + +SSH_KEY_PATH = "~/.ssh/id_rsa" # Node to deploy VM on. 
Can overlap with Zdb nodes or not, doesn't matter VM_NODE = 5 @@ -18,3 +18,7 @@ # Network used to connect to the backend zdbs # ZDB_CONNECTION = "mycelium" ZDB_CONNECTION = "ipv6" + +# Network used for SSH connection +# SSH_CONNECTION = "mycelium" +SSH_CONNECTION = "ipv6" diff --git a/pulumi/tests/rebuild/zinit b/pulumi/tests/rebuild/zinit new file mode 120000 index 0000000..ed52882 --- /dev/null +++ b/pulumi/tests/rebuild/zinit @@ -0,0 +1 @@ +../../zinit/ \ No newline at end of file diff --git a/pulumi/tests/rebuild/zinit/node-exporter.yaml b/pulumi/tests/rebuild/zinit/node-exporter.yaml deleted file mode 100644 index 1a466e1..0000000 --- a/pulumi/tests/rebuild/zinit/node-exporter.yaml +++ /dev/null @@ -1 +0,0 @@ -exec: prometheus-node-exporter diff --git a/pulumi/tests/rebuild/zinit/prometheus.yaml b/pulumi/tests/rebuild/zinit/prometheus.yaml deleted file mode 100644 index 3e07ecc..0000000 --- a/pulumi/tests/rebuild/zinit/prometheus.yaml +++ /dev/null @@ -1 +0,0 @@ -exec: prometheus --config.file=/etc/prometheus.yaml diff --git a/pulumi/tests/rebuild/zinit/zdb.yaml b/pulumi/tests/rebuild/zinit/zdb.yaml deleted file mode 100644 index 9e56d12..0000000 --- a/pulumi/tests/rebuild/zinit/zdb.yaml +++ /dev/null @@ -1,10 +0,0 @@ -exec: | - /usr/local/bin/zdb \ - --index /data/index \ - --data /data/data \ - --logfile /var/log/zdb.log \ - --datasize 67108864 \ - --hook /usr/local/bin/zdb-hook.sh \ - --rotate 10 -shutdown_timeout: 60 -after: [zstor] diff --git a/pulumi/tests/rebuild/zinit/zdbfs.yaml b/pulumi/tests/rebuild/zinit/zdbfs.yaml deleted file mode 100644 index c999d7d..0000000 --- a/pulumi/tests/rebuild/zinit/zdbfs.yaml +++ /dev/null @@ -1,2 +0,0 @@ -exec: /usr/local/bin/zdbfs /mnt/qsfs -o autons -after: [zdb] diff --git a/pulumi/tests/rebuild/zinit/zstor.yaml b/pulumi/tests/rebuild/zinit/zstor.yaml deleted file mode 100644 index 0f1a98f..0000000 --- a/pulumi/tests/rebuild/zinit/zstor.yaml +++ /dev/null @@ -1,6 +0,0 @@ -exec: | - /bin/zstor \ - -c 
/etc/zstor-default.toml \ - --log_file /var/log/zstor.log \ - monitor -shutdown_timeout: 300 diff --git a/pulumi/tests/test-scripts-local/create_data.sh b/pulumi/tests/test-scripts-local/create_data.sh new file mode 100755 index 0000000..315a8d7 --- /dev/null +++ b/pulumi/tests/test-scripts-local/create_data.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +echo -e "\n===== Removing local data files and reconstructin from backends =====" + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +ssh -t root@$ipv6 /root/test-scripts/write_data.sh +ssh -t root@$ipv6 /root/test-scripts/copy_to_qsfs.sh +ssh -t root@$ipv6 /root/test-scripts/check_hashes.sh diff --git a/pulumi/tests/test-scripts-local/rebuild_original.sh b/pulumi/tests/test-scripts-local/rebuild_original.sh index d90a422..c1718ff 100755 --- a/pulumi/tests/test-scripts-local/rebuild_original.sh +++ b/pulumi/tests/test-scripts-local/rebuild_original.sh @@ -3,7 +3,7 @@ # We need this to run non interactively. 
Otherwise we'll be prompted for it export PULUMI_CONFIG_PASSPHRASE="" -echo -e "\n===== Removing local data files and reconstructin from backends =====" +echo -e "\n===== Removing local data files and reconstructing from backends =====" ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) diff --git a/pulumi/tests/test-scripts-remote/check_hashes.sh b/pulumi/tests/test-scripts-remote/check_hashes.sh index 2d0ef86..9139a81 100755 --- a/pulumi/tests/test-scripts-remote/check_hashes.sh +++ b/pulumi/tests/test-scripts-remote/check_hashes.sh @@ -5,7 +5,7 @@ echo -e "\n===== Calculating MD5 hashes of stored files =====" # the new hashes file since we might run this multiple times rm -f /root/data/md5s_new for i in {1..10}; do - md5sum /mnt/qsfs/file$i.dat | cut -d " " -f 1 >> /root/data/md5s_new + md5sum /mnt/qsfs/file$i.dat | cut -d " " -f 1 | tee -a /root/data/md5s_new done echo -e "\n===== Comparing hashes ====" diff --git a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh index ea06e82..e793df5 100755 --- a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh +++ b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh @@ -23,4 +23,5 @@ for file in /data/data/zdbfs-data/*; do while ! 
zstor -c /etc/zstor-default.toml check --file "$file" &> /dev/null; do sleep 2 done + echo $file done diff --git a/pulumi/tests/test-scripts-remote/write_data.sh b/pulumi/tests/test-scripts-remote/write_data.sh index d6c0cd6..2d7de0f 100755 --- a/pulumi/tests/test-scripts-remote/write_data.sh +++ b/pulumi/tests/test-scripts-remote/write_data.sh @@ -16,7 +16,7 @@ done echo -e "\n===== Calculating MD5 checksums of source files =====" # Calculate and MD5 sum for each file and write to file -touch /root/data/md5s_original +rm -f /root/data/md5s_original for i in {1..10}; do - md5sum /root/data/file$i.dat | cut -d " " -f 1 >> /root/data/md5s_original + md5sum /root/data/file$i.dat | cut -d " " -f 1 | tee -a /root/data/md5s_original done From bf6de3cbeae0b2299a5a9ecec3b618516a3ac92d Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Mon, 16 Dec 2024 12:13:54 -0800 Subject: [PATCH 22/32] Use separate deployment for vm --- pulumi/__main__.py | 110 ++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index bc36f16..34846d0 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -75,6 +75,10 @@ elif ZDB_CONNECTION == "mycelium": ZDB_IP_INDEX = ZDB_MYC_INDEX +# CopyToRemote requires that the path used contains some file from the start, so +# we just put an empty one there if needed +Path(ZSTOR_CONFIG_PATH).touch() + provider = threefold.Provider( "provider", mnemonic=MNEMONIC, @@ -89,39 +93,41 @@ ip_range="10.1.0.0/16", # With mycelium enabled, we can't redeploy the vm # https://github.com/threefoldtech/pulumi-threefold/issues/552 + # Maybe it's okay though if we use separate deployements for vm and zdbs? 
# mycelium=True, opts=pulumi.ResourceOptions(provider=provider), ) -nodes = set([VM_NODE] + META_NODES + DATA_NODES) - -deployments = {} - -for node in nodes: - net_name = NET_NAME - vms = [] - depends = [] - if node == VM_NODE: - net_name = NET_NAME - depends.append(network) - vms.append( - threefold.VMInputArgs( - name="vm", - node_id=node, - flist=FLIST, - entrypoint="/sbin/zinit init", - network_name=net_name, - cpu=CPU, - memory=RAM, - rootfs_size=ROOTFS, - # mycelium=True, - planetary=True, - public_ip6=True, - env_vars={ - "SSH_KEY": ssh_public_key, - }, - ) +vm_deployment = threefold.Deployment( + "vm_deployment", + node_id=VM_NODE, + name="vm", + network_name=NET_NAME, + vms=[ + threefold.VMInputArgs( + name="vm", + node_id=VM_NODE, + flist=FLIST, + entrypoint="/sbin/zinit init", + network_name=NET_NAME, + cpu=CPU, + memory=RAM, + rootfs_size=ROOTFS, + # mycelium=True, + planetary=True, + public_ip6=True, + env_vars={ + "SSH_KEY": ssh_public_key, + }, ) + ], + opts=pulumi.ResourceOptions(provider=provider, depends_on=[network]), +) + +zdb_nodes = set(META_NODES + DATA_NODES) +zdb_deployments = [] + +for node in zdb_nodes: zdbs = [] if node in DATA_NODES: zdbs.append( @@ -142,14 +148,14 @@ ) ) - deployments[node] = threefold.Deployment( - "deployment" + str(node), - node_id=node, - name="node" + str(node), - network_name=net_name, - vms=vms, - zdbs=zdbs, - opts=pulumi.ResourceOptions(provider=provider, depends_on=depends), + zdb_deployments.append( + threefold.Deployment( + "zdb_deployment" + str(node), + node_id=node, + name="node" + str(node), + zdbs=zdbs, + opts=pulumi.ResourceOptions(provider=provider), + ) ) @@ -176,13 +182,11 @@ def make_zstor_config(args): shutil.copy(ZSTOR_CONFIG_BASE, path) + vm = args["vm"] meta_zdbs = [] data_zdbs = [] - for vm_list, zdb_list in args["deployments"]: - if vm_list: - vm = vm_list[0] - - for zdb in zdb_list: + for zdbs in args["zdbs"]: + for zdb in zdbs: if "meta" in zdb["namespace"]: meta_zdbs.append(zdb) else: @@ 
-221,29 +225,25 @@ def make_zstor_config(args): # This way the current file is always in the same place and we get around # the fact that it's not possible to return a path from this function and # use it as a FileAsset, because you can't pass an Output to FileAsset - shutil.copy(path, ZSTOR_CONFIG_PATH) - - # TODO: check if the new file is actually different than the previous one - # and if not, delete it. I guess we could have some better logic to - # actually detect if the zdbs have changed, but we still need to do the bit - # below to copy the file to the VM whenever we replace the VM, even if the - # config file is the same + if not open(path).read() == open(ZSTOR_CONFIG_PATH).read(): + shutil.copy(path, ZSTOR_CONFIG_PATH) + else: + # We end up regenerating the same file from time to time for reasons + # that may or may not be unavoidable. For now, just delete duplicates + os.remove(path) zstor_config_output = pulumi.Output.all( - deployments=[(d.vms_computed, d.zdbs_computed) for d in deployments.values()], + vm=vm_deployment.vms_computed[0], + zdbs=[d.zdbs_computed for d in zdb_deployments], zstor_key=zstor_key.hex, zdb_pw=zdb_pw.result, ).apply(make_zstor_config) -vm = deployments[VM_NODE].vms_computed[0] +vm = vm_deployment.vms_computed[0] conn = make_ssh_connection(vm) depends = [] -# CopyToRemote requires that there's any file in the path after the initial run -# of the code or it crashes. So we just put a dummy file here if needed. 
The -# config will get written and pulled into the VM anyway -Path(ZSTOR_CONFIG_PATH).touch() copy_zstor_config = pulumi_command.remote.CopyToRemote( "copy_zstor_config", connection=conn, @@ -253,7 +253,7 @@ def make_zstor_config(args): # TODO: need to verify that that this works in both cases where we need to # upload the config again: when the vm is changed and when any zdb is # changed - triggers=list(deployments.values()), + triggers=zdb_deployments + [vm_deployment], opts=pulumi.ResourceOptions(depends_on=[zstor_config_output]), ) From 43a5acaff9412b2512e604cece2372fda083bdda Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Mon, 16 Dec 2024 12:14:25 -0800 Subject: [PATCH 23/32] Remove unused original version secrets gen --- pulumi/__main__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 34846d0..8e870b3 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -1,5 +1,4 @@ import os -import secrets import shutil import textwrap from pathlib import Path @@ -65,8 +64,6 @@ META_SIZE = 1 # Generate separate secrets for Zstor key and Zdb namespaces passwords -ZSTOR_KEY = secrets.token_hex(32) -ZDB_PW = secrets.token_urlsafe(32) zstor_key = pulumi_random.RandomBytes("zstor_key", length=32) zdb_pw = pulumi_random.RandomPassword("zdb_pw", length=20) From ee090ff5c9268d43ff85a59f44df97dffcc9ef3f Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Mon, 16 Dec 2024 19:05:57 -0800 Subject: [PATCH 24/32] Update test --- pulumi/scripts/activate_qsfs.sh | 0 pulumi/scripts/prep_vm.sh | 0 pulumi/scripts/recover.sh | 0 pulumi/tests/rebuild/cleanup.sh | 5 +++++ pulumi/tests/rebuild/run.sh | 5 ++++- pulumi/tests/rebuild/vars.new.py | 6 +++--- pulumi/tests/rebuild/vars.original.py | 6 +++--- pulumi/tests/rebuild/zinit | 1 - pulumi/tests/rebuild/zinit/node-exporter.yaml | 1 + pulumi/tests/rebuild/zinit/prometheus.yaml | 1 + pulumi/tests/rebuild/zinit/zdb.yaml | 10 ++++++++++ pulumi/tests/rebuild/zinit/zdbfs.yaml | 1 + 
pulumi/tests/rebuild/zinit/zstor.yaml | 1 + pulumi/tests/test-scripts-local/create_data.sh | 5 ++++- pulumi/tests/test-scripts-local/recover.sh | 13 +++++++++++++ pulumi/tests/test-scripts-local/redeploy.sh | 11 ++++++----- .../{rebuild_original.sh => remove_and_rebuild.sh} | 0 pulumi/tests/test-scripts-remote/copy_to_qsfs.sh | 9 ++++----- pulumi/tests/test-scripts-remote/write_data.sh | 2 +- 19 files changed, 57 insertions(+), 20 deletions(-) mode change 100644 => 100755 pulumi/scripts/activate_qsfs.sh mode change 100644 => 100755 pulumi/scripts/prep_vm.sh mode change 100644 => 100755 pulumi/scripts/recover.sh create mode 100755 pulumi/tests/rebuild/cleanup.sh delete mode 120000 pulumi/tests/rebuild/zinit create mode 120000 pulumi/tests/rebuild/zinit/node-exporter.yaml create mode 120000 pulumi/tests/rebuild/zinit/prometheus.yaml create mode 100644 pulumi/tests/rebuild/zinit/zdb.yaml create mode 120000 pulumi/tests/rebuild/zinit/zdbfs.yaml create mode 120000 pulumi/tests/rebuild/zinit/zstor.yaml create mode 100755 pulumi/tests/test-scripts-local/recover.sh rename pulumi/tests/test-scripts-local/{rebuild_original.sh => remove_and_rebuild.sh} (100%) diff --git a/pulumi/scripts/activate_qsfs.sh b/pulumi/scripts/activate_qsfs.sh old mode 100644 new mode 100755 diff --git a/pulumi/scripts/prep_vm.sh b/pulumi/scripts/prep_vm.sh old mode 100644 new mode 100755 diff --git a/pulumi/scripts/recover.sh b/pulumi/scripts/recover.sh old mode 100644 new mode 100755 diff --git a/pulumi/tests/rebuild/cleanup.sh b/pulumi/tests/rebuild/cleanup.sh new file mode 100755 index 0000000..57e1a25 --- /dev/null +++ b/pulumi/tests/rebuild/cleanup.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +rm zstor_config.toml* +rm md5s_original +rm vars.py diff --git a/pulumi/tests/rebuild/run.sh b/pulumi/tests/rebuild/run.sh index 8f4875d..6b30190 100755 --- a/pulumi/tests/rebuild/run.sh +++ b/pulumi/tests/rebuild/run.sh @@ -6,5 +6,8 @@ source ../../venv/bin/activate ../test-scripts-local/upload_remote_scripts.sh 
../test-scripts-local/create_data.sh ../test-scripts-local/redeploy.sh -../test-scripts-local/rebuild_original.sh +../test-scripts-local/recover.sh +../test-scripts-local/upload_remote_scripts.sh +../test-scripts-local/remove_and_rebuild.sh ../test-scripts-local/destroy.sh +./cleanup.sh diff --git a/pulumi/tests/rebuild/vars.new.py b/pulumi/tests/rebuild/vars.new.py index 3c7645b..96afbfb 100644 --- a/pulumi/tests/rebuild/vars.new.py +++ b/pulumi/tests/rebuild/vars.new.py @@ -1,15 +1,15 @@ # These are the new values used in the test MNEMONIC = "" -NETWORK = "test" +NETWORK = "main" SSH_KEY_PATH = "~/.ssh/id_rsa" # Node to deploy VM on. Can overlap with Zdb nodes or not, doesn't matter -VM_NODE = 5 +VM_NODE = 1 # Nodes to deploy Zdbs on -META_NODES = [1, 2, 3, 7] +META_NODES = [8, 10, 11, 24] DATA_NODES = META_NODES # Size of each data backend Zdb diff --git a/pulumi/tests/rebuild/vars.original.py b/pulumi/tests/rebuild/vars.original.py index cf78401..e4d027d 100644 --- a/pulumi/tests/rebuild/vars.original.py +++ b/pulumi/tests/rebuild/vars.original.py @@ -1,15 +1,15 @@ # These are the original values used in the test MNEMONIC = "" -NETWORK = "test" +NETWORK = "main" SSH_KEY_PATH = "~/.ssh/id_rsa" # Node to deploy VM on. 
Can overlap with Zdb nodes or not, doesn't matter -VM_NODE = 5 +VM_NODE = 13 # Nodes to deploy Zdbs on -META_NODES = [1, 2, 3, 5] +META_NODES = [8, 10, 11, 13] DATA_NODES = META_NODES # Size of each data backend Zdb diff --git a/pulumi/tests/rebuild/zinit b/pulumi/tests/rebuild/zinit deleted file mode 120000 index ed52882..0000000 --- a/pulumi/tests/rebuild/zinit +++ /dev/null @@ -1 +0,0 @@ -../../zinit/ \ No newline at end of file diff --git a/pulumi/tests/rebuild/zinit/node-exporter.yaml b/pulumi/tests/rebuild/zinit/node-exporter.yaml new file mode 120000 index 0000000..d72c19d --- /dev/null +++ b/pulumi/tests/rebuild/zinit/node-exporter.yaml @@ -0,0 +1 @@ +../../../zinit/node-exporter.yaml \ No newline at end of file diff --git a/pulumi/tests/rebuild/zinit/prometheus.yaml b/pulumi/tests/rebuild/zinit/prometheus.yaml new file mode 120000 index 0000000..f5ffa37 --- /dev/null +++ b/pulumi/tests/rebuild/zinit/prometheus.yaml @@ -0,0 +1 @@ +../../../zinit/prometheus.yaml \ No newline at end of file diff --git a/pulumi/tests/rebuild/zinit/zdb.yaml b/pulumi/tests/rebuild/zinit/zdb.yaml new file mode 100644 index 0000000..9e56d12 --- /dev/null +++ b/pulumi/tests/rebuild/zinit/zdb.yaml @@ -0,0 +1,10 @@ +exec: | + /usr/local/bin/zdb \ + --index /data/index \ + --data /data/data \ + --logfile /var/log/zdb.log \ + --datasize 67108864 \ + --hook /usr/local/bin/zdb-hook.sh \ + --rotate 10 +shutdown_timeout: 60 +after: [zstor] diff --git a/pulumi/tests/rebuild/zinit/zdbfs.yaml b/pulumi/tests/rebuild/zinit/zdbfs.yaml new file mode 120000 index 0000000..c26e599 --- /dev/null +++ b/pulumi/tests/rebuild/zinit/zdbfs.yaml @@ -0,0 +1 @@ +../../../zinit/zdbfs.yaml \ No newline at end of file diff --git a/pulumi/tests/rebuild/zinit/zstor.yaml b/pulumi/tests/rebuild/zinit/zstor.yaml new file mode 120000 index 0000000..b37e568 --- /dev/null +++ b/pulumi/tests/rebuild/zinit/zstor.yaml @@ -0,0 +1 @@ +../../../zinit/zstor.yaml \ No newline at end of file diff --git 
a/pulumi/tests/test-scripts-local/create_data.sh b/pulumi/tests/test-scripts-local/create_data.sh index 315a8d7..3552d04 100755 --- a/pulumi/tests/test-scripts-local/create_data.sh +++ b/pulumi/tests/test-scripts-local/create_data.sh @@ -3,10 +3,13 @@ # We need this to run non interactively. Otherwise we'll be prompted for it export PULUMI_CONFIG_PASSPHRASE="" -echo -e "\n===== Removing local data files and reconstructin from backends =====" +echo -e "\n===== Writing data, copying data to QSFS, and checking hashes =====" ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) ssh -t root@$ipv6 /root/test-scripts/write_data.sh ssh -t root@$ipv6 /root/test-scripts/copy_to_qsfs.sh ssh -t root@$ipv6 /root/test-scripts/check_hashes.sh + +# Store a copy of the hashes locally, in case we redeploy the VM +scp "root@[$ipv6]:/root/data/md5s_original" ./ diff --git a/pulumi/tests/test-scripts-local/recover.sh b/pulumi/tests/test-scripts-local/recover.sh new file mode 100755 index 0000000..5aac918 --- /dev/null +++ b/pulumi/tests/test-scripts-local/recover.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# We need this to run non interactively. Otherwise we'll be prompted for it +export PULUMI_CONFIG_PASSPHRASE="" + +echo -e "\n===== Running recover script on remote VM =====" + +ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) + +# The scripts uploaded by Pulumi won't be executable. We could fix that elsewhere +ssh -t root@$ipv6 bash /root/scripts/recover.sh +ssh -t root@$ipv6 mkdir /root/data +scp ./md5s_original "root@[$ipv6]:/root/data/md5s_original" diff --git a/pulumi/tests/test-scripts-local/redeploy.sh b/pulumi/tests/test-scripts-local/redeploy.sh index 42cb55e..4c22be9 100755 --- a/pulumi/tests/test-scripts-local/redeploy.sh +++ b/pulumi/tests/test-scripts-local/redeploy.sh @@ -3,6 +3,8 @@ # We need this to run non interactively. 
Otherwise we'll be prompted for it export PULUMI_CONFIG_PASSPHRASE="" +echo -e "\n===== Redeploying with vars.new.py and issuing SIGUSR1 =====" + pulumi stack init test cp vars.new.py vars.py @@ -10,10 +12,9 @@ pulumi up -s test -y --non-interactive ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut -d '/' -f 1) -ssh -t root@$ipv6 ' +# Since we might use this script both in cases where the frontend VM is +# replaced or not, we'll just go ahead and try to issue the SIGUSR1 even though +# it has no effect on a fresh VM +ssh -o StrictHostKeyChecking=accept-new -t root@$ipv6 ' pkill zstor -SIGUSR1 - # Wait some time to let the rebuild process start. This should be enough? - sleep 10 ' - -# TODO: Need to figure out a way to check whether rebuilding has succeeded. diff --git a/pulumi/tests/test-scripts-local/rebuild_original.sh b/pulumi/tests/test-scripts-local/remove_and_rebuild.sh similarity index 100% rename from pulumi/tests/test-scripts-local/rebuild_original.sh rename to pulumi/tests/test-scripts-local/remove_and_rebuild.sh diff --git a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh index e793df5..06c1f54 100755 --- a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh +++ b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh @@ -14,11 +14,10 @@ for i in {1..10}; do done echo -e "\n===== Waiting for all data files to upload =====" -# Here we are taking advantage of the fact that there is a 10 second delay -# before the last data file gets rotated, as specified in zinit/zdb.yaml. -# After the rotation, there will be a new file with a higher index number -# that is not uploaded to zstor, but we do not care about that file since it -# will have no data +# At this point, all the data is in zdb data files. Since we set the rotation +# time for 10 seconds, the last data file should get rotated and uploaded via +# zstor without much delay. 
At that point, a new empty data file will be +# created, which we don't care about for file in /data/data/zdbfs-data/*; do while ! zstor -c /etc/zstor-default.toml check --file "$file" &> /dev/null; do sleep 2 diff --git a/pulumi/tests/test-scripts-remote/write_data.sh b/pulumi/tests/test-scripts-remote/write_data.sh index 2d7de0f..18e5812 100755 --- a/pulumi/tests/test-scripts-remote/write_data.sh +++ b/pulumi/tests/test-scripts-remote/write_data.sh @@ -11,7 +11,7 @@ echo "===== Creating 10 test files with 100MB random data each =====" mkdir -p /root/data for i in {1..10}; do echo "Creating file$i.dat..." - dd if=/dev/urandom of=/root/data/file$i.dat bs=1M count=100 + dd if=/dev/urandom of=/root/data/file$i.dat bs=1M count=10 done echo -e "\n===== Calculating MD5 checksums of source files =====" From f0fc3fc949e29bfedaf706d2ec35711f64c6cf89 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 17 Dec 2024 10:46:53 -0800 Subject: [PATCH 25/32] Update README with backend replacement details --- pulumi/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pulumi/README.md b/pulumi/README.md index c709c22..eb28f9c 100644 --- a/pulumi/README.md +++ b/pulumi/README.md @@ -56,6 +56,24 @@ If you want to destroy the deployment, bring it down like this: pulumi down ``` +## Replacing backends + +If you want to replace any data or metadata backends, just edit `vars.py` and run `pulumi up` again. Note that this is a destructive operation and any backends not present in the new config will be decommissioned. Data loss is possible if too many backends are decommissioned at one time without rebuilding the data. You must have the minimal shard count available to be able to reconstruct the data. + +After running `pulumi up` with the new config, the Pulumi script will automatically upload an updated Zstor config file to the VM. However, Zstor will not start using the new config automatically.
You either need to restart Zstor or perform a hot reload of the config by sending the SIGUSR1 signal to Zstor: + +``` +pkill zstor -SIGUSR1 +``` + +Once the new config is loaded, Zstor will automatically start writing data or metadata to the new backends to restore the desired shard count for each stored file. This can take up to ten minutes to be triggered. + +You can check the progress of rebuilding using the Zstor `status` command: + +``` +zstor -c /etc/zstor-default.toml status +``` + ## Recover to new VM If you need to replace the frontend VM for any reason, such as a node outage, follow these steps. Any data that has been uploaded to the backends can be recovered into the new VM. Any data that was not yet uploaded to the backends will be lost. From 31dfd375c389f0e2374470e47ed2ae768f2279cd Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Tue, 17 Dec 2024 17:54:14 -0800 Subject: [PATCH 26/32] Support deploying with no frontend VM --- pulumi/__main__.py | 191 +++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 94 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 8e870b3..0c96588 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -95,31 +95,34 @@ opts=pulumi.ResourceOptions(provider=provider), ) -vm_deployment = threefold.Deployment( - "vm_deployment", - node_id=VM_NODE, - name="vm", - network_name=NET_NAME, - vms=[ - threefold.VMInputArgs( - name="vm", - node_id=VM_NODE, - flist=FLIST, - entrypoint="/sbin/zinit init", - network_name=NET_NAME, - cpu=CPU, - memory=RAM, - rootfs_size=ROOTFS, - # mycelium=True, - planetary=True, - public_ip6=True, - env_vars={ - "SSH_KEY": ssh_public_key, - }, - ) - ], - opts=pulumi.ResourceOptions(provider=provider, depends_on=[network]), -) +if VM_NODE is not None: + vm_deployment = threefold.Deployment( + "vm_deployment", + node_id=VM_NODE, + name="vm", + network_name=NET_NAME, + vms=[ + threefold.VMInputArgs( + name="vm", + node_id=VM_NODE, + flist=FLIST, + 
entrypoint="/sbin/zinit init", + network_name=NET_NAME, + cpu=CPU, + memory=RAM, + rootfs_size=ROOTFS, + # mycelium=True, + planetary=True, + public_ip6=True, + env_vars={ + "SSH_KEY": ssh_public_key, + }, + ) + ], + opts=pulumi.ResourceOptions(provider=provider, depends_on=[network]), + ) +else: + vm_deployment = None zdb_nodes = set(META_NODES + DATA_NODES) zdb_deployments = [] @@ -237,84 +240,84 @@ def make_zstor_config(args): zdb_pw=zdb_pw.result, ).apply(make_zstor_config) -vm = vm_deployment.vms_computed[0] -conn = make_ssh_connection(vm) -depends = [] - -copy_zstor_config = pulumi_command.remote.CopyToRemote( - "copy_zstor_config", - connection=conn, - source=pulumi.FileAsset(ZSTOR_CONFIG_PATH), - remote_path=ZSTOR_CONFIG_REMOTE, - # triggers=[conn.host], - # TODO: need to verify that that this works in both cases where we need to - # upload the config again: when the vm is changed and when any zdb is - # changed - triggers=zdb_deployments + [vm_deployment], - opts=pulumi.ResourceOptions(depends_on=[zstor_config_output]), -) - +if vm_deployment: + vm = vm_deployment.vms_computed[0] + conn = make_ssh_connection(vm) + depends = [] + + copy_zstor_config = pulumi_command.remote.CopyToRemote( + "copy_zstor_config", + connection=conn, + source=pulumi.FileAsset(ZSTOR_CONFIG_PATH), + remote_path=ZSTOR_CONFIG_REMOTE, + # triggers=[conn.host], + # TODO: need to verify that that this works in both cases where we need to + # upload the config again: when the vm is changed and when any zdb is + # changed + triggers=zdb_deployments + [vm_deployment], + opts=pulumi.ResourceOptions(depends_on=[zstor_config_output]), + ) -if os.path.isfile("prometheus.yaml"): - depends.append( - pulumi_command.remote.CopyToRemote( - "copy_prometheus", - connection=conn, - source=pulumi.FileAsset("prometheus.yaml"), - remote_path="/etc/prometheus.yaml", - triggers=[conn.host], + if os.path.isfile("prometheus.yaml"): + depends.append( + pulumi_command.remote.CopyToRemote( + "copy_prometheus", 
+ connection=conn, + source=pulumi.FileAsset("prometheus.yaml"), + remote_path="/etc/prometheus.yaml", + triggers=[conn.host], + ) ) - ) -# In case we want to test our own zstor binary, such as a prebuild -if os.path.isfile("zstor"): - depends.append( - pulumi_command.remote.CopyToRemote( - "copy_zstor_binary", - connection=conn, - source=pulumi.FileAsset("zstor"), - remote_path="/usr/bin/zstor", - triggers=[conn.host], + # In case we want to test our own zstor binary, such as a prebuild + if os.path.isfile("zstor"): + depends.append( + pulumi_command.remote.CopyToRemote( + "copy_zstor_binary", + connection=conn, + source=pulumi.FileAsset("zstor"), + remote_path="/usr/bin/zstor", + triggers=[conn.host], + ) ) - ) -# We put the zinit files under /root to start, so that the services don't get -# started accidentally on reboot. In the case of recovering on a new VM, we -# need to ensure some other steps are completed first -copy_zinit = pulumi_command.remote.CopyToRemote( - "copy_zinit", - connection=conn, - source=pulumi.FileArchive("zinit/"), - remote_path="/root/zinit/", - triggers=[conn.host], -) + # We put the zinit files under /root to start, so that the services don't get + # started accidentally on reboot. 
In the case of recovering on a new VM, we + # need to ensure some other steps are completed first + copy_zinit = pulumi_command.remote.CopyToRemote( + "copy_zinit", + connection=conn, + source=pulumi.FileArchive("zinit/"), + remote_path="/root/zinit/", + triggers=[conn.host], + ) -copy_scripts = pulumi_command.remote.CopyToRemote( - "copy_scripts", - connection=conn, - source=pulumi.FileArchive("scripts/"), - remote_path="/root/scripts/", - triggers=[conn.host], -) + copy_scripts = pulumi_command.remote.CopyToRemote( + "copy_scripts", + connection=conn, + source=pulumi.FileArchive("scripts/"), + remote_path="/root/scripts/", + triggers=[conn.host], + ) -depends.append(copy_scripts) + depends.append(copy_scripts) -prep_vm = pulumi_command.remote.Command( - "prep_vm", - connection=conn, - create="bash /root/scripts/prep_vm.sh 2>&1 | tee > /var/log/prep_vm.log", - triggers=[conn.host], - opts=pulumi.ResourceOptions(depends_on=depends), -) + prep_vm = pulumi_command.remote.Command( + "prep_vm", + connection=conn, + create="bash /root/scripts/prep_vm.sh 2>&1 | tee > /var/log/prep_vm.log", + triggers=[conn.host], + opts=pulumi.ResourceOptions(depends_on=depends), + ) -depends.extend([prep_vm, copy_zinit, copy_zstor_config]) -pulumi_command.remote.Command( - "activate_qsfs", - connection=conn, - create="bash /root/scripts/activate_qsfs.sh 2>&1 | tee > /var/log/activate_qsfs.log", - update="", - opts=pulumi.ResourceOptions(depends_on=depends), -) + depends.extend([prep_vm, copy_zinit, copy_zstor_config]) + pulumi_command.remote.Command( + "activate_qsfs", + connection=conn, + create="bash /root/scripts/activate_qsfs.sh 2>&1 | tee > /var/log/activate_qsfs.log", + update="", + opts=pulumi.ResourceOptions(depends_on=depends), + ) pulumi.export("mycelium_ip", vm.mycelium_ip) pulumi.export("pub_ipv6", vm.computed_ip6) From 4658465663301fd2f3d76ae186aee1dc3b7f78e2 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Wed, 18 Dec 2024 16:19:33 -0800 Subject: [PATCH 27/32] Only deploy 
network with vm, fix zstor config upload --- pulumi/__main__.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 0c96588..c520b49 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -82,20 +82,22 @@ network=NETWORK, ) -network = threefold.Network( - "network", - name=NET_NAME, - description="A network", - nodes=[VM_NODE], - ip_range="10.1.0.0/16", - # With mycelium enabled, we can't redeploy the vm - # https://github.com/threefoldtech/pulumi-threefold/issues/552 - # Maybe it's okay though if we use separate deployements for vm and zdbs? - # mycelium=True, - opts=pulumi.ResourceOptions(provider=provider), -) - +# Deploying a VM is optional. Some users might want to use an existing VM or +# another system for their QSFS frontend if VM_NODE is not None: + network = threefold.Network( + "network", + name=NET_NAME, + description="A network", + nodes=[VM_NODE], + ip_range="10.1.0.0/16", + # With mycelium enabled, we can't redeploy the vm + # https://github.com/threefoldtech/pulumi-threefold/issues/552 + # Maybe it's okay though if we use separate deployments for vm and zdbs? + # mycelium=True, + opts=pulumi.ResourceOptions(provider=provider), + ) + vm_deployment = threefold.Deployment( "vm_deployment", node_id=VM_NODE, @@ -250,12 +252,11 @@ def make_zstor_config(args): connection=conn, source=pulumi.FileAsset(ZSTOR_CONFIG_PATH), remote_path=ZSTOR_CONFIG_REMOTE, - # triggers=[conn.host], - # TODO: need to verify that that this works in both cases where we need to - # upload the config again: when the vm is changed and when any zdb is - # changed - triggers=zdb_deployments + [vm_deployment], - opts=pulumi.ResourceOptions(depends_on=[zstor_config_output]), + # Without this trigger, a new upload isn't triggered when the VM is + # replaced.
However, the file on an existing VM gets updated just with + # zstor_config_output in the depends_on list + triggers=[conn.host], + opts=pulumi.ResourceOptions(depends_on=[zstor_config_output, vm_deployment]), ) if os.path.isfile("prometheus.yaml"): From 2783f531714c8d10193e3e915b03f662f68808ed Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Wed, 18 Dec 2024 16:20:10 -0800 Subject: [PATCH 28/32] Wait for hash to print on file uploads --- pulumi/tests/test-scripts-remote/copy_to_qsfs.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh index 06c1f54..71bfaeb 100755 --- a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh +++ b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh @@ -18,9 +18,17 @@ echo -e "\n===== Waiting for all data files to upload =====" # time for 10 seconds, the last data file should get rotated and uploaded via # zstor without much delay. At that point, a new empty data file will be # created, which we don't care about +ls -lh /data/data/zdbfs-data for file in /data/data/zdbfs-data/*; do - while ! zstor -c /etc/zstor-default.toml check --file "$file" &> /dev/null; do +while true; do + # Originally we just looked at the exit code of `check` but this was not + # reliable. We need to wait for a hash output to be sure zstor has finished + # storing the file + check_output=$(zstor -c /etc/zstor-default.toml check --file "$file") + if [ ! 
-z "$check_output" ]; then + echo $file $check_output + break + fi sleep 2 done - echo $file done From c231e5f1e789158e3041f2d4cf8f80ee0e14032d Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Wed, 18 Dec 2024 16:40:21 -0800 Subject: [PATCH 29/32] Don't delete stack unless `down` succeeds --- pulumi/tests/test-scripts-local/destroy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pulumi/tests/test-scripts-local/destroy.sh b/pulumi/tests/test-scripts-local/destroy.sh index 92d90e7..336668b 100755 --- a/pulumi/tests/test-scripts-local/destroy.sh +++ b/pulumi/tests/test-scripts-local/destroy.sh @@ -3,5 +3,5 @@ # We need this to run non interactively. Otherwise we'll be prompted for it export PULUMI_CONFIG_PASSPHRASE="" -pulumi down -s test -y --non-interactive -pulumi stack rm -yf test +# If we fail to delete the deployment, we should keep the stack around +pulumi down -s test -y --non-interactive && pulumi stack rm -yf test From 724e231c0037f15b200f9c42100a128a84e606a3 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Fri, 20 Dec 2024 19:02:10 -0800 Subject: [PATCH 30/32] Add retry uploads --- pulumi/scripts/activate_qsfs.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pulumi/scripts/activate_qsfs.sh b/pulumi/scripts/activate_qsfs.sh index 6ffda12..c7591e7 100755 --- a/pulumi/scripts/activate_qsfs.sh +++ b/pulumi/scripts/activate_qsfs.sh @@ -16,12 +16,14 @@ echo Copying zinit service files cp /root/zinit/zstor.yaml /etc/zinit cp /root/zinit/zdb.yaml /etc/zinit cp /root/zinit/zdbfs.yaml /etc/zinit +cp /root/zinit/retry-uploads.yaml /etc/zinit echo echo Starting up zinit services zinit monitor zstor zinit monitor zdb zinit monitor zdbfs +zinit monitor retry-uploads if [ -f /etc/prometheus.yaml ]; then echo From 5696a297c5988e01b8fdb09c0b956a9baf24b02f Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Fri, 20 Dec 2024 19:02:57 -0800 Subject: [PATCH 31/32] Dedicated script to wait for uploads --- .../tests/test-scripts-local/create_data.sh 
| 1 + .../tests/test-scripts-remote/copy_to_qsfs.sh | 20 ----------- .../test-scripts-remote/wait_all_uploads.sh | 34 +++++++++++++++++++ 3 files changed, 35 insertions(+), 20 deletions(-) create mode 100755 pulumi/tests/test-scripts-remote/wait_all_uploads.sh diff --git a/pulumi/tests/test-scripts-local/create_data.sh b/pulumi/tests/test-scripts-local/create_data.sh index 3552d04..e127e1f 100755 --- a/pulumi/tests/test-scripts-local/create_data.sh +++ b/pulumi/tests/test-scripts-local/create_data.sh @@ -9,6 +9,7 @@ ipv6=$(pulumi stack -s test | grep pub_ipv6 | tr -s " " | cut -d ' ' -f 3 | cut ssh -t root@$ipv6 /root/test-scripts/write_data.sh ssh -t root@$ipv6 /root/test-scripts/copy_to_qsfs.sh +ssh -t root@$ipv6 /root/test-scripts/wait_all_uploads.sh ssh -t root@$ipv6 /root/test-scripts/check_hashes.sh # Store a copy of the hashes locally, in case we redeploy the VM diff --git a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh index 71bfaeb..ca01b9c 100755 --- a/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh +++ b/pulumi/tests/test-scripts-remote/copy_to_qsfs.sh @@ -12,23 +12,3 @@ for i in {1..10}; do echo "Copying file$i.dat..." pv -s 100m "/root/data/file$i.dat" > "/mnt/qsfs/file$i.dat" done - -echo -e "\n===== Waiting for all data files to upload =====" -# At this point, all the data is in zdb data files. Since we set the rotation -# time for 10 seconds, the last data file should get rotated and uploaded via -# zstor without much delay. At that point, a new empty data file will be -# created, which we don't care about -ls -lh /data/data/zdbfs-data -for file in /data/data/zdbfs-data/*; do -while true; do - # Originally we just looked at the exit code of `check` but this was not - # reliable. We need to wait for a hash output to be sure zstor has finished - # storing the file - check_output=$(zstor -c /etc/zstor-default.toml check --file "$file") - if [ ! 
-z "$check_output" ]; then - echo $file $check_output - break - fi - sleep 2 - done -done diff --git a/pulumi/tests/test-scripts-remote/wait_all_uploads.sh b/pulumi/tests/test-scripts-remote/wait_all_uploads.sh new file mode 100755 index 0000000..ced718c --- /dev/null +++ b/pulumi/tests/test-scripts-remote/wait_all_uploads.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# This script waits for all files to be uploaded to zstor (the ones that are +# expected to be uploaded, anyway). + +echo -e "\n===== Waiting for all data files to upload =====" + +wait_for_upload() { + while [ -z $(zstor -c /etc/zstor-default.toml check --file "$1") ]; do + sleep 2 + done + echo $1 +} + +for namespace in "zdbfs-data" "zdbfs-meta"; do + namespace_file="/data/index/$namespace/zdb-namespace" + if [ -f "$namespace_file" ]; then + wait_for_upload $namespace_file + else + echo Namespace file missing: $namespace_file + fi + + for type in "data" "index"; do + # The index directory also has the namespace file, so we exclude that by + # only looking for files starting with d or i + path_base=/data/$type/$namespace/${type:0:1} + # We want to check every file except for the largest sequence number, so + # we sort and throw away the last row. Here an ls even without -1 helps + # sort to work, while echo doesn't.
Not sure why + for file in $(ls -1 $path_base* | sort -V | head -n -1); do + wait_for_upload $file + done + done +done From 5d3d07a52594fdee3950fc384e89770a0cae8284 Mon Sep 17 00:00:00 2001 From: Scott Yeager Date: Fri, 20 Dec 2024 19:05:04 -0800 Subject: [PATCH 32/32] Add Prometheus push gateway --- pulumi/prometheus.example.yaml | 4 ++++ pulumi/scripts/prep_vm.sh | 6 +++++- pulumi/zinit/prometheus-pushgateway.yaml | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 pulumi/zinit/prometheus-pushgateway.yaml diff --git a/pulumi/prometheus.example.yaml b/pulumi/prometheus.example.yaml index 3c13abc..298eb12 100644 --- a/pulumi/prometheus.example.yaml +++ b/pulumi/prometheus.example.yaml @@ -14,3 +14,7 @@ scrape_configs: - job_name: node-exporter-zstor static_configs: - targets: ["localhost:9100"] + - job_name: pushgateway-zstor + static_configs: + - targets: ["localhost:9091"] + honor_labels: true diff --git a/pulumi/scripts/prep_vm.sh b/pulumi/scripts/prep_vm.sh index 42fa3c5..83ae137 100755 --- a/pulumi/scripts/prep_vm.sh +++ b/pulumi/scripts/prep_vm.sh @@ -19,6 +19,10 @@ if ! [ -f /usr/local/bin/zdb-hook.sh ]; then wget -O /usr/local/bin/zdb-hook.sh https://raw.githubusercontent.com/threefoldtech/quantum-storage/master/lib/zdb-hook.sh fi +if ! [ -f /usr/local/bin/retry-uploads.sh ]; then + wget -O /usr/local/bin/retry-uploads.sh https://raw.githubusercontent.com/threefoldtech/quantum-storage/master/lib/retry-uploads.sh +fi + if ! 
[ -f /bin/zstor ]; then wget -O /bin/zstor https://github.com/threefoldtech/0-stor_v2/releases/download/v0.4.0/zstor_v2-x86_64-linux-musl fi @@ -31,5 +35,5 @@ if [ -f /etc/prometheus.yaml ]; then echo echo Installing Prometheus apt update - apt install -y prometheus + apt install -y prometheus prometheus-pushgateway curl fi diff --git a/pulumi/zinit/prometheus-pushgateway.yaml b/pulumi/zinit/prometheus-pushgateway.yaml new file mode 100644 index 0000000..092fc4d --- /dev/null +++ b/pulumi/zinit/prometheus-pushgateway.yaml @@ -0,0 +1 @@ +exec: prometheus-pushgateway --persistence.file=/var/lib/prometheus/pushgateway-persistence.data