diff --git a/.github/workflows/helm-ci.yaml b/.github/workflows/helm-ci.yaml index 5d31409..061a4b5 100644 --- a/.github/workflows/helm-ci.yaml +++ b/.github/workflows/helm-ci.yaml @@ -6,12 +6,14 @@ on: paths: - 'client/**' - 'ingestor/**' + - 'scripts/**' - '.github/workflows/helm-ci.yaml' pull_request: branches: [main, develop, openshift] paths: - 'client/**' - 'ingestor/**' + - 'scripts/**' - '.github/workflows/helm-ci.yaml' jobs: @@ -115,3 +117,29 @@ jobs: -f client/ci/${{ matrix.platform }}-values.yaml \ > /dev/null echo "Schema validation passed for ${{ matrix.platform }}" + + installer-tests: + name: Installer script tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install bats + run: sudo apt-get update -qq && sudo apt-get install -y -qq bats + + - name: bats unit tests (bash installer) + run: bats scripts/tests/*.bats + + - name: Pester unit tests (PowerShell installer) + shell: pwsh + env: + TB_PESTER: "1" + run: | + Set-PSRepository PSGallery -InstallationPolicy Trusted + Install-Module Pester -MinimumVersion 5.5.0 -Force -SkipPublisherCheck -Scope CurrentUser + Import-Module Pester -MinimumVersion 5.5.0 -Force + $cfg = New-PesterConfiguration + $cfg.Run.Path = "scripts/tests/install-k8s.Tests.ps1" + $cfg.Run.Exit = $true + $cfg.Output.Verbosity = "Detailed" + Invoke-Pester -Configuration $cfg diff --git a/scripts/install-k8s.ps1 b/scripts/install-k8s.ps1 index 1e7dadb..bcacd9a 100644 --- a/scripts/install-k8s.ps1 +++ b/scripts/install-k8s.ps1 @@ -26,14 +26,18 @@ param([switch]$Help, [switch]$NoReboot) # -- Admin check -------------------------------------------------------------- -$isAdmin = ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent() - ).IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator) -if (-not $isAdmin) { - Write-Host " " -NoNewline; Write-Host ([char]0x2716) -ForegroundColor Red -NoNewline; Write-Host " Run this script as Administrator (right-click > Run as 
Administrator)." -ForegroundColor Red - exit 1 -} +# $env:TB_PESTER lets the test suite dot-source this file to load the functions +# without triggering the admin gate (which throws off-Windows) or running main. +if (-not $env:TB_PESTER) { + $isAdmin = ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent() + ).IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator) + if (-not $isAdmin) { + Write-Host " " -NoNewline; Write-Host ([char]0x2716) -ForegroundColor Red -NoNewline; Write-Host " Run this script as Administrator (right-click > Run as Administrator)." -ForegroundColor Red + exit 1 + } -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 + [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 +} # ============================================================================= # HELPERS — logging functions matching bash UX @@ -122,6 +126,8 @@ $CLIENT_ENV = $env:CLIENT_ENV $GPU_VENDOR = "none" $NVIDIA_DRIVER_OK = $false $K3D_GPU_FLAG = "" +$ReadyTimeout = if ($env:READY_TIMEOUT) { $env:READY_TIMEOUT } else { "300" } +$script:ClientState = "starting" # ============================================================================= # HELP @@ -848,6 +854,39 @@ function Get-TraceblocYamlValue { return $val } +# Resolve the backend base URL the same way jobs-manager does +# (client-runtime/controller.py: CLIENT_ENV -> backend), defaulting to prod. +function Get-BackendUrl { + switch ($env:CLIENT_ENV) { + "dev" { return "https://dev-api.tracebloc.io/" } + "stg" { return "https://stg-api.tracebloc.io/" } + default { return "https://api.tracebloc.io/" } + } +} + +# Validate the entered Client ID / password against the backend's +# api-token-auth/ endpoint -- the same call jobs-manager makes at runtime. +# Returns: valid | invalid | inactive | unverified. 
+function Test-Credentials { + param([string]$ClientId, [string]$ClientPassword) + $backend = Get-BackendUrl + try { + $resp = Invoke-WebRequest -Uri "${backend}api-token-auth/" -Method Post ` + -Body @{ username = $ClientId; password = $ClientPassword } ` + -TimeoutSec 60 -UseBasicParsing -ErrorAction Stop + if ($resp.StatusCode -eq 200) { return "valid" } + return "unverified" + } catch { + $code = $null + if ($_.Exception.Response) { $code = [int]$_.Exception.Response.StatusCode } + switch ($code) { + 400 { return "invalid" } + 401 { return "inactive" } + default { return "unverified" } # 429 throttled, connection failure, 5xx, … + } + } +} + function Install-ClientHelm { # -- Step 3/4: Install tracebloc client -- Step 3 4 "Installing tracebloc client" @@ -887,6 +926,7 @@ function Install-ClientHelm { $nsInput = Read-Host " Workspace name [$defaultNamespace]" $rawName = if ($nsInput) { $nsInput } else { $defaultNamespace } $TB_NAMESPACE = ConvertTo-WorkspaceName -Input_ $rawName + $script:TB_NAMESPACE = $TB_NAMESPACE # share with Wait-ForClientReady / Print-Summary if ($TB_NAMESPACE -ne $rawName) { Info "Using workspace: $TB_NAMESPACE" @@ -903,28 +943,53 @@ function Install-ClientHelm { Write-Host " " -NoNewline; Write-Host "https://ai.tracebloc.io/clients" -ForegroundColor White Write-Host "" - if ($defaultClientId) { - $idInput = Read-Host " Client ID [$defaultClientId]" - $TB_CLIENT_ID = if ($idInput) { $idInput } else { $defaultClientId } - } else { - $TB_CLIENT_ID = Read-Host " Client ID" - } - if (-not $TB_CLIENT_ID) { Err "Client ID cannot be empty." } - - if ($defaultClientPassword) { - $pwInput = Read-Host " Client password [press Enter to keep existing]" -AsSecureString - if ($pwInput -and $pwInput.Length -gt 0) { + # Collect + verify credentials. 
The entered Client ID / password are checked + # against the backend (the same api-token-auth/ call jobs-manager makes) + # before we deploy, so a wrong credential is caught here -- with a re-prompt -- + # instead of surfacing later as a silently crash-looping pod. + $credAttempt = 0; $credMax = 5 + while ($true) { + if ($defaultClientId) { + $idInput = Read-Host " Client ID [$defaultClientId]" + $TB_CLIENT_ID = if ($idInput) { $idInput } else { $defaultClientId } + } else { + $TB_CLIENT_ID = Read-Host " Client ID" + } + if (-not $TB_CLIENT_ID) { Warn "Client ID cannot be empty."; continue } + + if ($defaultClientPassword) { + $pwInput = Read-Host " Client password [press Enter to keep existing]" -AsSecureString + if ($pwInput -and $pwInput.Length -gt 0) { + $BSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($pwInput) + try { $TB_CLIENT_PASSWORD = [System.Runtime.InteropServices.Marshal]::PtrToStringAuto($BSTR) } finally { [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($BSTR) } + } else { + $TB_CLIENT_PASSWORD = $defaultClientPassword + } + } else { + $pwInput = Read-Host " Client password" -AsSecureString $BSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($pwInput) try { $TB_CLIENT_PASSWORD = [System.Runtime.InteropServices.Marshal]::PtrToStringAuto($BSTR) } finally { [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($BSTR) } + } + if (-not $TB_CLIENT_PASSWORD) { Warn "Client password cannot be empty."; continue } + + Info "Verifying credentials with tracebloc..." + $credStatus = Test-Credentials -ClientId $TB_CLIENT_ID -ClientPassword $TB_CLIENT_PASSWORD + if ($credStatus -eq "valid") { Ok "Credentials verified."; break } + elseif ($credStatus -eq "inactive") { Err "This tracebloc account is not active yet. Check your email for the activation link, then re-run." } + elseif ($credStatus -eq "unverified") { + Warn "Couldn't reach tracebloc to verify your credentials right now - continuing." 
+ Hint "If they are wrong, your client will stay offline at https://ai.tracebloc.io/clients after install." + break } else { - $TB_CLIENT_PASSWORD = $defaultClientPassword + Warn "That Client ID / password was rejected by tracebloc - please re-enter." + Hint "Find your credentials at https://ai.tracebloc.io/clients" } - } else { - $pwInput = Read-Host " Client password" -AsSecureString - $BSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($pwInput) - try { $TB_CLIENT_PASSWORD = [System.Runtime.InteropServices.Marshal]::PtrToStringAuto($BSTR) } finally { [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($BSTR) } + + $credAttempt++ + if ($credAttempt -ge $credMax) { Err "Too many failed attempts. Double-check your credentials at https://ai.tracebloc.io/clients and re-run." } + # Force active re-entry on retry (don't silently reuse a rejected default). + $defaultClientId = ""; $defaultClientPassword = "" } - if (-not $TB_CLIENT_PASSWORD) { Err "Client password cannot be empty." } $passwordEscaped = $TB_CLIENT_PASSWORD -replace "'", "''" @@ -1013,46 +1078,117 @@ function Confirm-Cluster { Log $clusterInfo $nodes = kubectl get nodes -o wide 2>&1 | Out-String Log $nodes + $pods = kubectl get pods -n $script:TB_NAMESPACE -o wide 2>&1 | Out-String + Log $pods Log "--- End Cluster Status ---" } +# ── Readiness gate (#716) ───────────────────────────────────────────────── +# helm install only *applies* manifests; it does not wait for pods. Wait for the +# client's workloads to actually become Ready and set $script:ClientState so the +# summary reports the truth: connected | starting | bad_creds | image_pull | crash +function Wait-ForClientReady { + $ns = $script:TB_NAMESPACE + $deploys = @("mysql-client", "$ns-jobs-manager", "$ns-requests-proxy") + $deadline = (Get-Date).AddSeconds([int]$ReadyTimeout) + $allReady = $true + + Write-Host "" + Info "Waiting for the client to start - first run downloads images, this can take a few minutes..." 
+ foreach ($d in $deploys) { + $remaining = [int]((New-TimeSpan -Start (Get-Date) -End $deadline).TotalSeconds) + if ($remaining -lt 10) { $remaining = 10 } + & kubectl rollout status "deployment/$d" -n $ns "--timeout=${remaining}s" 2>&1 | Out-Null + if ($LASTEXITCODE -eq 0) { + Ok ("{0} ready" -f ($d -replace "^$ns-", "")) + } else { + $allReady = $false + break + } + } + + Confirm-Cluster + if ($allReady) { $script:ClientState = "connected" } + else { $script:ClientState = (Get-NotReadyState -Namespace $ns) } +} + +# Classify why the client isn't Ready, for an accurate message. Returns a state. +function Get-NotReadyState { + param([string]$Namespace) + # Wrong credentials: jobs-manager authenticates to the backend on startup and + # crash-loops when rejected -- surfaced as an auth error in its logs. + $jmLogs = (& kubectl logs -n $Namespace "deployment/$Namespace-jobs-manager" --all-containers --tail=50 2>$null | Out-String) + if ($jmLogs -match '(?i)authentication failed|unable to log in') { return "bad_creds" } + $pods = (& kubectl get pods -n $Namespace 2>$null | Out-String) + if ($pods -match '(?i)ImagePullBackOff|ErrImagePull|InvalidImageName') { return "image_pull" } + if ($pods -match '(?i)CrashLoopBackOff') { return "crash" } + return "starting" +} + # ============================================================================= # SUMMARY # ============================================================================= +# Reports the outcome based on $script:ClientState (set by Wait-ForClientReady). +# The "secure compute environment / your data never leaves" claim is printed +# ONLY when the client is verifiably connected -- never on a partial/failed run. 
function Print-Summary { $mode = "CPU" if ($GPU_VENDOR -eq "nvidia" -and $NVIDIA_DRIVER_OK) { $mode = "NVIDIA GPU" } elseif ($GPU_VENDOR -eq "nvidia" -and -not $NVIDIA_DRIVER_OK) { $mode = "CPU (NVIDIA driver update needed)" } + $ns = $script:TB_NAMESPACE + $line = [string]([char]0x2501) * 46 Write-Host "" - Write-Host " " -NoNewline; Write-Host ([string]([char]0x2501) * 46) -ForegroundColor Green - Write-Host "" - Write-Host " " -NoNewline; Write-Host "tracebloc client installed successfully" -ForegroundColor Green - Write-Host "" - Write-Host " " -NoNewline; Write-Host "Workspace" -ForegroundColor White -NoNewline; Write-Host " : " -NoNewline; Write-Host $TB_NAMESPACE -ForegroundColor Cyan - Write-Host " " -NoNewline; Write-Host "Mode " -ForegroundColor White -NoNewline; Write-Host " : " -NoNewline; Write-Host $mode -ForegroundColor Cyan - Write-Host "" - Hint "This machine is now a secure compute environment" - Hint "on the tracebloc network. External AI vendors can" - Hint "submit models to be trained and evaluated here --" - Hint "your data never leaves your infrastructure." - Write-Host "" - Write-Host " What to do next" -ForegroundColor White - Write-Host "" - Write-Host " 1. " -NoNewline; Write-Host "Open the tracebloc dashboard" - Write-Host " " -NoNewline; Write-Host "https://ai.tracebloc.io" -ForegroundColor Cyan - Write-Host "" - Write-Host " 2. " -NoNewline; Write-Host "Ingest your training and test data" - Write-Host "" - Write-Host " 3. " -NoNewline; Write-Host "Define your first AI use case and" - Write-Host " invite vendors to submit models" - Write-Host "" - Hint "Need help? 
https://docs.tracebloc.io" - Hint "Logs: ~\.tracebloc\" - Hint "Data: /tracebloc/$TB_NAMESPACE" - Write-Host "" - Write-Host " " -NoNewline; Write-Host ([string]([char]0x2501) * 46) -ForegroundColor Green + switch ($script:ClientState) { + "connected" { + Write-Host " $line" -ForegroundColor Green + Write-Host "" + Write-Host " " -NoNewline; Write-Host "$([char]0x2714) Connected to tracebloc" -ForegroundColor Green + Write-Host "" + Write-Host " Workspace : " -NoNewline; Write-Host $ns -ForegroundColor Cyan + Write-Host " Mode : " -NoNewline; Write-Host $mode -ForegroundColor Cyan + Write-Host "" + Write-Host " Your client is live. Confirm it shows as Online:" + Write-Host " https://ai.tracebloc.io/clients" -ForegroundColor Cyan + Write-Host "" + Hint "Models that vendors submit train on this machine -- your data never leaves it." + Write-Host "" + Write-Host " What to do next" -ForegroundColor White + Write-Host " 1. Ingest your training and test data" + Write-Host " 2. Define your first AI use case and invite vendors" + Write-Host "" + Hint "Dashboard: https://ai.tracebloc.io Logs: ~\.tracebloc\ Data: /tracebloc/$ns" + Write-Host "" + Write-Host " $line" -ForegroundColor Green + } + "starting" { + Write-Host " " -NoNewline; Write-Host "$([char]0x26A0) Almost there - tracebloc is installed but still starting." -ForegroundColor Yellow + Write-Host "" + Write-Host " Components are still downloading/starting (first run can take a few minutes)." + Write-Host " Check progress: kubectl get pods -n $ns" -ForegroundColor Cyan + Write-Host "" + Write-Host " Your client will show as Online at https://ai.tracebloc.io/clients once it finishes." + Hint "Re-running this installer is safe." + } + "bad_creds" { + Write-Host " " -NoNewline; Write-Host "$([char]0x2716) Couldn't connect - your Client ID or password was rejected." -ForegroundColor Red + Write-Host "" + Write-Host " The environment installed, but tracebloc refused those credentials." + Write-Host " 1. 
Re-check them at https://ai.tracebloc.io/clients" -ForegroundColor Cyan + Write-Host " 2. Re-run this installer (safe to re-run)" + } + default { + $reason = "a component didn't start" + if ($script:ClientState -eq "image_pull") { $reason = "an image couldn't be pulled" } + if ($script:ClientState -eq "crash") { $reason = "a container is restarting (crash loop)" } + Write-Host " " -NoNewline; Write-Host "$([char]0x2716) Setup didn't finish - $reason." -ForegroundColor Red + Write-Host "" + Write-Host " Inspect: kubectl get pods -n $ns" -ForegroundColor Cyan + Write-Host " Logs: ~\.tracebloc\install-*.log" + Hint "Re-running this installer is safe." + } + } Write-Host "" # Advanced info for log only @@ -1078,6 +1214,8 @@ function Print-Summary { # MAIN # ============================================================================= +if (-not $env:TB_PESTER) { + if ($Help) { Print-Help } Confirm-Config @@ -1105,7 +1243,13 @@ Confirm-GpuNode # -- Steps 3/4 + 4/4 handled inside Install-ClientHelm -- Install-ClientHelm -Confirm-Cluster +# Verify the client actually came up before reporting anything +Wait-ForClientReady Print-Summary try { Stop-Transcript | Out-Null } catch {} + +# Exit code reflects reality: connected/starting are OK; failures are non-zero. +if ($script:ClientState -ne "connected" -and $script:ClientState -ne "starting") { exit 1 } + +} # end TB_PESTER guard (skipped when the test suite dot-sources this file) diff --git a/scripts/install-k8s.sh b/scripts/install-k8s.sh index a2dac2b..8568f79 100755 --- a/scripts/install-k8s.sh +++ b/scripts/install-k8s.sh @@ -83,8 +83,16 @@ main() { # ── Step 3/4 + 4/4 are handled inside install_client_helm ──────────────── install_client_helm - verify_cluster + # ── Verify the client actually came up before reporting anything ───────── + wait_for_client_ready print_summary + + # Exit code reflects reality: connected/starting are OK; failures are non-zero + # so re-runs and automation can tell the difference. 
+ case "${CLIENT_STATE:-}" in + connected|starting) ;; + *) exit 1 ;; + esac } main "$@" diff --git a/scripts/lib/common.sh b/scripts/lib/common.sh index de7db8a..9e1a55c 100755 --- a/scripts/lib/common.sh +++ b/scripts/lib/common.sh @@ -299,10 +299,14 @@ install_cleanup() { fi [[ -n "${LOG_FILE:-}" ]] && hint "Logs: $LOG_FILE" elif [[ $exit_code -ne 0 ]]; then - echo "" - warn "Installation did not complete." - [[ -n "${LOG_FILE:-}" ]] && hint "Check the install log: $LOG_FILE" - hint "This installer is safe to re-run — just try again." + # If print_summary already reported a specific outcome (CLIENT_STATE set), + # don't tack on a second, generic "did not complete" message. + if [[ -z "${CLIENT_STATE:-}" ]]; then + echo "" + warn "Installation did not complete." + [[ -n "${LOG_FILE:-}" ]] && hint "Check the install log: $LOG_FILE" + hint "This installer is safe to re-run — just try again." + fi fi } diff --git a/scripts/lib/install-client-helm.sh b/scripts/lib/install-client-helm.sh index 173a95a..2ffd5e2 100644 --- a/scripts/lib/install-client-helm.sh +++ b/scripts/lib/install-client-helm.sh @@ -110,6 +110,35 @@ _sanitize_workspace_name() { printf '%s' "$sanitized" } +# ── Credential verification (#717) ──────────────────────────────────────── +# Resolve the backend base URL the same way jobs-manager does +# (client-runtime/controller.py: CLIENT_ENV → backend), defaulting to prod. +_backend_url() { + case "${CLIENT_ENV:-prod}" in + dev) printf 'https://dev-api.tracebloc.io/' ;; + stg) printf 'https://stg-api.tracebloc.io/' ;; + *) printf 'https://api.tracebloc.io/' ;; + esac +} + +# Validate the entered Client ID / password against the backend's +# api-token-auth/ endpoint — the same call jobs-manager makes at runtime — +# using curl (already a dependency). Echoes: valid | invalid | inactive | unverified. 
+verify_credentials() { + local client_id="$1" client_password="$2" backend code + backend="$(_backend_url)" + code=$(curl -sS -m 60 -o /dev/null -w '%{http_code}' \ + --data-urlencode "username=${client_id}" \ + --data-urlencode "password=${client_password}" \ + "${backend}api-token-auth/" 2>/dev/null) || code="000" + case "$code" in + 200) printf 'valid' ;; + 400) printf 'invalid' ;; + 401) printf 'inactive' ;; + *) printf 'unverified' ;; # 429 throttled, 000 unreachable, 5xx, … + esac +} + install_client_helm() { # ── Step 3/4: Install tracebloc client ─────────────────────────────────── step 3 4 "Installing tracebloc client" @@ -175,25 +204,56 @@ install_client_helm() { echo -e " ${BOLD}${WHITE}https://ai.tracebloc.io/clients${RESET}" echo "" - if [[ -n "$default_client_id" ]]; then - read -r -p " Client ID [${default_client_id}]: " TB_CLIENT_ID_INPUT - TB_CLIENT_ID="${TB_CLIENT_ID_INPUT:-$default_client_id}" - else - read -r -p " Client ID: " TB_CLIENT_ID - fi - TB_CLIENT_ID=$(_sanitize_credential "$TB_CLIENT_ID") - [[ -z "$TB_CLIENT_ID" ]] && error "Client ID cannot be empty." - - if [[ -n "$default_client_password" ]]; then - read -r -s -p " Client password [press Enter to keep existing]: " TB_CLIENT_PASSWORD_INPUT - echo "" - TB_CLIENT_PASSWORD="${TB_CLIENT_PASSWORD_INPUT:-$default_client_password}" - else - read -r -s -p " Client password: " TB_CLIENT_PASSWORD - echo "" - fi - TB_CLIENT_PASSWORD=$(_sanitize_credential "$TB_CLIENT_PASSWORD") - [[ -z "$TB_CLIENT_PASSWORD" ]] && error "Client password cannot be empty." + # Collect + verify credentials. The entered Client ID / password are checked + # against the backend (the same api-token-auth/ call jobs-manager makes) + # before we deploy, so a wrong credential is caught here — with a re-prompt — + # instead of surfacing later as a silently crash-looping pod. 
+ local _cred_attempt=0 _cred_max=5 _cred_status + while true; do + if [[ -n "$default_client_id" ]]; then + read -r -p " Client ID [${default_client_id}]: " TB_CLIENT_ID_INPUT + TB_CLIENT_ID="${TB_CLIENT_ID_INPUT:-$default_client_id}" + else + read -r -p " Client ID: " TB_CLIENT_ID + fi + TB_CLIENT_ID=$(_sanitize_credential "$TB_CLIENT_ID") + if [[ -z "$TB_CLIENT_ID" ]]; then warn "Client ID cannot be empty."; continue; fi + + if [[ -n "$default_client_password" ]]; then + read -r -s -p " Client password [press Enter to keep existing]: " TB_CLIENT_PASSWORD_INPUT + echo "" + TB_CLIENT_PASSWORD="${TB_CLIENT_PASSWORD_INPUT:-$default_client_password}" + else + read -r -s -p " Client password: " TB_CLIENT_PASSWORD + echo "" + fi + TB_CLIENT_PASSWORD=$(_sanitize_credential "$TB_CLIENT_PASSWORD") + if [[ -z "$TB_CLIENT_PASSWORD" ]]; then warn "Client password cannot be empty."; continue; fi + + info "Verifying credentials with tracebloc…" + _cred_status=$(verify_credentials "$TB_CLIENT_ID" "$TB_CLIENT_PASSWORD") + case "$_cred_status" in + valid) + success "Credentials verified." + break ;; + invalid) + warn "That Client ID / password was rejected by tracebloc — please re-enter." + hint "Find your credentials at https://ai.tracebloc.io/clients" ;; + inactive) + error "This tracebloc account is not active yet. Check your email for the activation link, then re-run." ;; + unverified) + warn "Couldn't reach tracebloc to verify your credentials right now — continuing." + hint "If they are wrong, your client will stay offline at https://ai.tracebloc.io/clients after install." + break ;; + esac + + _cred_attempt=$((_cred_attempt + 1)) + if [[ $_cred_attempt -ge $_cred_max ]]; then + error "Too many failed attempts. Double-check your credentials at https://ai.tracebloc.io/clients and re-run." + fi + # Force an active re-entry on retry (don't silently reuse a rejected default). 
+ default_client_id=""; default_client_password="" + done TB_CLIENT_PASSWORD_ESCAPED="${TB_CLIENT_PASSWORD//\'/\'\'}" diff --git a/scripts/lib/setup-linux.sh b/scripts/lib/setup-linux.sh index a3dc8f4..c934124 100755 --- a/scripts/lib/setup-linux.sh +++ b/scripts/lib/setup-linux.sh @@ -24,6 +24,15 @@ install_docker_engine() { spin_cmd "Installing Docker…" sudo pacman -S --noconfirm docker elif has zypper; then spin_cmd "Installing Docker…" sudo zypper install -y docker + elif [[ -f /etc/os-release ]] && grep -qiE '^ID="?(almalinux|rocky|ol|oracle)"?' /etc/os-release; then + # get.docker.com rejects RHEL rebuilds (almalinux/rocky/ol) with + # "Unsupported distribution". Install docker-ce from Docker's official + # CentOS repo instead — it is RHEL-compatible and works on these distros. + spin_cmd "Installing Docker…" bash -c ' + set -e + sudo dnf -y -q install dnf-plugins-core + sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo + sudo dnf -y -q install docker-ce docker-ce-cli containerd.io' else local docker_script docker_script="$(mktemp)" @@ -54,9 +63,13 @@ install_docker_engine() { # ── System dependencies ───────────────────────────────────────────────────── install_system_deps() { + # conntrack binary ships under different package names per distro: + # Debian/Ubuntu (apt) → "conntrack"; RHEL/SUSE/Arch (dnf/yum/zypper/pacman) → "conntrack-tools" + local conntrack_pkg="conntrack-tools" + has apt-get && conntrack_pkg="conntrack" MISSING_PKGS=() has curl || MISSING_PKGS+=(curl) - has conntrack || MISSING_PKGS+=(conntrack-tools) + has conntrack || MISSING_PKGS+=("$conntrack_pkg") if [[ ${#MISSING_PKGS[@]} -gt 0 ]]; then spin_cmd "Updating package index…" $PM_UPDATE for pkg in "${MISSING_PKGS[@]}"; do @@ -107,7 +120,11 @@ install_k3d() { https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh -o "$k3d_script" chmod +x "$k3d_script" - if ! 
spin_cmd "Installing system tools…" sudo bash "$k3d_script"; then + # Preserve PATH through sudo: the k3d install script verifies itself with + # `command -v k3d` after copying the binary into /usr/local/bin. On RHEL-family + # distros sudo's secure_path excludes /usr/local/bin, so that check fails and + # the script aborts with "k3d not found". `sudo env PATH=$PATH` keeps it visible. + if ! spin_cmd "Installing system tools…" sudo env "PATH=$PATH" bash "$k3d_script"; then rm -f "$k3d_script" error "System tool installation failed. See the install log for details." fi diff --git a/scripts/lib/summary.sh b/scripts/lib/summary.sh index 16665ff..90e784b 100755 --- a/scripts/lib/summary.sh +++ b/scripts/lib/summary.sh @@ -3,47 +3,131 @@ # summary.sh — Final success screen + cluster verification (debug only) # ============================================================================= -verify_cluster() { +# Cluster status dump (debug log only). +_log_cluster_status() { log "--- Cluster Status ---" kubectl cluster-info >> "${LOG_FILE:-/dev/null}" 2>&1 || true kubectl get nodes -o wide >> "${LOG_FILE:-/dev/null}" 2>&1 || true - kubectl get pods -n "${TB_NAMESPACE:-default}" >> "${LOG_FILE:-/dev/null}" 2>&1 || true + kubectl get pods -n "${TB_NAMESPACE:-default}" -o wide >> "${LOG_FILE:-/dev/null}" 2>&1 || true log "--- End Cluster Status ---" } +# ── Readiness gate (#716) ───────────────────────────────────────────────── +# helm install only *applies* manifests; it does not wait for pods. 
After it +# returns we wait for the client's workloads to actually become Ready and set +# CLIENT_STATE so the summary reports the truth instead of an unconditional +# "installed successfully": +# connected | starting | bad_creds | image_pull | crash +CLIENT_STATE="starting" +READY_TIMEOUT="${READY_TIMEOUT:-300}" + +wait_for_client_ready() { + local ns="${TB_NAMESPACE:-default}" + local deploys=("mysql-client" "${ns}-jobs-manager" "${ns}-requests-proxy") + local deadline=$(( $(date +%s) + READY_TIMEOUT )) + local all_ready=true d remaining + + echo "" + info "Waiting for the client to start — first run downloads images, this can take a few minutes…" + for d in "${deploys[@]}"; do + remaining=$(( deadline - $(date +%s) )); (( remaining < 10 )) && remaining=10 + if kubectl rollout status "deployment/${d}" -n "$ns" --timeout="${remaining}s" \ + >> "${LOG_FILE:-/dev/null}" 2>&1; then + success "${d#${ns}-} ready" + else + all_ready=false; break + fi + done + + _log_cluster_status + if [[ "$all_ready" == true ]]; then + CLIENT_STATE="connected" + else + CLIENT_STATE="$(_diagnose_not_ready "$ns")" + fi + return 0 +} + +# Classify why the client isn't Ready, for an accurate message. Echoes a state. +_diagnose_not_ready() { + local ns="$1" pods jm_logs + # Wrong credentials: jobs-manager authenticates to the backend on startup and + # crash-loops when rejected — surfaced as an auth error in its logs. 
+ jm_logs="$(kubectl logs -n "$ns" "deployment/${ns}-jobs-manager" --all-containers --tail=50 2>/dev/null || true)" + if printf '%s' "$jm_logs" | grep -qiE 'authentication failed|unable to log in'; then + printf 'bad_creds'; return + fi + pods="$(kubectl get pods -n "$ns" 2>/dev/null || true)" + if printf '%s' "$pods" | grep -qiE 'ImagePullBackOff|ErrImagePull|InvalidImageName'; then + printf 'image_pull'; return + fi + if printf '%s' "$pods" | grep -qiE 'CrashLoopBackOff'; then + printf 'crash'; return + fi + printf 'starting' +} + +# Reports the outcome based on CLIENT_STATE (set by wait_for_client_ready). +# The "secure compute environment / your data never leaves" claim is printed +# ONLY when the client is verifiably connected — never on a partial/failed run. print_summary() { local mode="CPU" [[ "$GPU_VENDOR" == "nvidia" ]] && mode="NVIDIA GPU" [[ "$GPU_VENDOR" == "amd" ]] && mode="AMD GPU" + local ns="${TB_NAMESPACE:-default}" + local line="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "" - echo -e " ${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}" - echo "" - echo -e " ${BOLD}${GREEN}tracebloc client installed successfully${RESET}" - echo "" - echo -e " ${BOLD}Workspace${RESET} : ${CYAN}${TB_NAMESPACE:-default}${RESET}" - echo -e " ${BOLD}Mode${RESET} : ${CYAN}${mode}${RESET}" - echo "" - echo -e " ${DIM}This machine is now a secure compute environment${RESET}" - echo -e " ${DIM}on the tracebloc network. 
External AI vendors can${RESET}" - echo -e " ${DIM}submit models to be trained and evaluated here —${RESET}" - echo -e " ${DIM}your data never leaves your infrastructure.${RESET}" - echo "" - echo -e " ${BOLD}What to do next${RESET}" - echo "" - echo -e " ${WHITE}1.${RESET} Open the tracebloc dashboard" - echo -e " ${CYAN}https://ai.tracebloc.io${RESET}" - echo "" - echo -e " ${WHITE}2.${RESET} Ingest your training and test data" - echo "" - echo -e " ${WHITE}3.${RESET} Define your first AI use case and" - echo -e " invite vendors to submit models" - echo "" - echo -e " ${DIM}Need help?${RESET} ${CYAN}https://docs.tracebloc.io${RESET}" - echo -e " ${DIM}Logs:${RESET} ${DIM}~/.tracebloc/${RESET}" - echo -e " ${DIM}Data:${RESET} ${DIM}/tracebloc/${TB_NAMESPACE:-default}${RESET}" - echo "" - echo -e " ${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}" + case "$CLIENT_STATE" in + connected) + echo -e " ${GREEN}${line}${RESET}" + echo "" + echo -e " ${BOLD}${GREEN}✔ Connected to tracebloc${RESET}" + echo "" + echo -e " ${BOLD}Workspace${RESET} : ${CYAN}${ns}${RESET}" + echo -e " ${BOLD}Mode${RESET} : ${CYAN}${mode}${RESET}" + echo "" + echo -e " Your client is live. 
Confirm it shows as ${BOLD}🟢 Online${RESET}:" + echo -e " ${CYAN}https://ai.tracebloc.io/clients${RESET}" + echo "" + echo -e " ${DIM}Models that vendors submit train on this machine —${RESET}" + echo -e " ${DIM}your data never leaves it.${RESET}" + echo "" + echo -e " ${BOLD}What to do next${RESET}" + echo -e " ${WHITE}1.${RESET} Ingest your training and test data" + echo -e " ${WHITE}2.${RESET} Define your first AI use case and invite vendors" + echo "" + echo -e " ${DIM}Dashboard:${RESET} ${CYAN}https://ai.tracebloc.io${RESET} ${DIM}Logs:${RESET} ${DIM}~/.tracebloc/${RESET} ${DIM}Data:${RESET} ${DIM}/tracebloc/${ns}${RESET}" + echo "" + echo -e " ${GREEN}${line}${RESET}" + ;; + starting) + echo -e " ${YELLOW}⚠ Almost there — tracebloc is installed but still starting.${RESET}" + echo "" + echo -e " Components are still downloading/starting (first run can take a few minutes)." + echo -e " Check progress: ${CYAN}kubectl get pods -n ${ns}${RESET}" + echo "" + echo -e " Your client will show as ${BOLD}🟢 Online${RESET} at ${CYAN}https://ai.tracebloc.io/clients${RESET}" + echo -e " once it finishes. ${DIM}Re-running this installer is safe.${RESET}" + ;; + bad_creds) + echo -e " ${RED}${BOLD}✖ Couldn't connect — your Client ID or password was rejected.${RESET}" >&2 + echo "" + echo -e " The environment installed, but tracebloc refused those credentials." + echo -e " 1. Re-check them at ${CYAN}https://ai.tracebloc.io/clients${RESET}" + echo -e " 2. 
Re-run this installer ${DIM}(safe to re-run)${RESET}"
+      ;;
+    image_pull|crash)
+      local reason="a component didn't start"
+      [[ "$CLIENT_STATE" == "image_pull" ]] && reason="an image couldn't be pulled"
+      [[ "$CLIENT_STATE" == "crash" ]] && reason="a container is restarting (crash loop)"
+      echo -e " ${RED}${BOLD}✖ Setup didn't finish — ${reason}.${RESET}" >&2
+      echo ""
+      echo -e " Inspect: ${CYAN}kubectl get pods -n ${ns}${RESET}"
+      echo -e " Logs: ${DIM}~/.tracebloc/install-*.log${RESET}"
+      echo -e " ${DIM}Re-running this installer is safe.${RESET}"
+      ;;
+  esac
   echo ""
   _log_advanced_info
diff --git a/scripts/tests/common.bats b/scripts/tests/common.bats
new file mode 100644
index 0000000..83758d2
--- /dev/null
+++ b/scripts/tests/common.bats
@@ -0,0 +1,93 @@
+#!/usr/bin/env bats
+# Tests for scripts/lib/common.sh — config validation, the install_cleanup
+# CLIENT_STATE guard (#716), retry, has. Every function under test comes from common.sh via load_lib.
+load test_helper
+
+setup() {
+  load_lib  # no argument: only common.sh is sourced
+}
+
+# ── validate_config ────────────────────────────────────────────────────────
+@test "validate_config: valid config passes" {
+  HOME="$BATS_TEST_TMPDIR"; USER=tester  # throwaway HOME so HOST_DATA_DIR below sits under it
+  CLUSTER_NAME=tracebloc; SERVERS=1; AGENTS=1
+  HOST_DATA_DIR="$HOME/.tracebloc"
+  run validate_config
+  [ "$status" -eq 0 ]
+}
+
+@test "validate_config: invalid CLUSTER_NAME -> error" {
+  HOME="$BATS_TEST_TMPDIR"; USER=tester
+  CLUSTER_NAME="1nope"; SERVERS=1; AGENTS=1; HOST_DATA_DIR="$HOME/x"  # only CLUSTER_NAME differs from the valid case
+  run validate_config
+  [ "$status" -ne 0 ]
+  [[ "$output" == *"CLUSTER_NAME"* ]]  # the error must name the offending setting
+}
+
+@test "validate_config: invalid SERVERS -> error" {
+  HOME="$BATS_TEST_TMPDIR"; USER=tester
+  CLUSTER_NAME=ok; SERVERS=0; AGENTS=1; HOST_DATA_DIR="$HOME/x"
+  run validate_config
+  [ "$status" -ne 0 ]
+  [[ "$output" == *"SERVERS"* ]]
+}
+
+@test "validate_config: HOST_DATA_DIR outside HOME -> error" {
+  HOME="$BATS_TEST_TMPDIR"; USER=tester
+  CLUSTER_NAME=ok; SERVERS=1; AGENTS=1; HOST_DATA_DIR="/tmp/not-under-home-$$"  # $$ only makes the path unique; the test never creates it
+  run validate_config
+  [ "$status" -ne 0 ]
+  [[ "$output" == *"HOST_DATA_DIR"* ]]
+}
+
+# ── install_cleanup: the CLIENT_STATE guard (#716) ─────────────────────────
+@test "install_cleanup: exit 0 -> silent" {
+  out="$( ( exit 0 ); install_cleanup 2>&1 )"  # ( exit N ) sets $? right before the call — presumably install_cleanup reads $?; confirm in common.sh
+  [[ "$out" != *"did not complete"* ]]
+}
+
+@test "install_cleanup: failure + CLIENT_STATE set -> suppresses generic message" {
+  CLIENT_STATE=connected
+  out="$( ( exit 1 ); install_cleanup 2>&1 )"
+  [[ "$out" != *"did not complete"* ]]
+}
+
+@test "install_cleanup: failure + CLIENT_STATE unset -> shows generic message" {
+  unset CLIENT_STATE
+  out="$( ( exit 1 ); install_cleanup 2>&1 )"
+  [[ "$out" == *"did not complete"* ]]
+}
+
+@test "install_cleanup: exit 2 -> re-run hint" {
+  unset CLIENT_STATE
+  out="$( ( exit 2 ); install_cleanup 2>&1 )"
+  [[ "$out" == *"Re-run required"* || "$out" == *"Complete the step"* ]]  # either wording is accepted
+}
+
+# ── retry ──────────────────────────────────────────────────────────────────
+@test "retry: succeeds on first attempt" {
+  run retry 3 1 true  # presumably <max attempts> <delay> <command...> — TODO confirm against retry() in common.sh
+  [ "$status" -eq 0 ]
+}
+
+@test "retry: gives up after max attempts" {
+  run retry 2 0 false
+  [ "$status" -ne 0 ]
+}
+
+@test "retry: succeeds after a transient failure" {
+  marker="$BATS_TEST_TMPDIR/m"
+  flaky() { if [ -f "$marker" ]; then return 0; fi; touch "$marker"; return 1; }  # first call creates the marker and fails; later calls succeed
+  run retry 3 0 flaky
+  [ "$status" -eq 0 ]
+}
+
+# ── has ────────────────────────────────────────────────────────────────────
+@test "has: present command" { run has bash; [ "$status" -eq 0 ]; }
+@test "has: absent command" { run has nope-not-a-real-cmd-xyz; [ "$status" -ne 0 ]; }
+
+# ── check_docker_arch_mac (no-op off macOS) ────────────────────────────────
+@test "check_docker_arch_mac: no-op on non-macOS" {
+  run check_docker_arch_mac  # NOTE(review): written for non-macOS hosts (CI runs on ubuntu-latest); behaviour on a macOS host is not exercised here
+  [ "$status" -eq 0 ]
+}
diff --git a/scripts/tests/install-client-helm.bats b/scripts/tests/install-client-helm.bats
new file mode 100644
index 0000000..739a96d
--- /dev/null
+++ b/scripts/tests/install-client-helm.bats
@@ -0,0 +1,224 @@
+#!/usr/bin/env bats
+# Tests for
scripts/lib/install-client-helm.sh — credential verification (#717)
+# + the install_client_helm flow.
+load test_helper
+
+setup() {
+  load_lib install-client-helm.sh  # sources common.sh, then the lib under test
+  MOCK_CALLS="$BATS_TEST_TMPDIR/mock_calls"; : >"$MOCK_CALLS"  # per-test call log in bats' auto-removed temp dir (a bare mktemp here leaked one file per test)
+  GPU_VENDOR=none
+  CLIENT_ENV=""
+}
+
+# ── _backend_url ───────────────────────────────────────────────────────────
+@test "_backend_url: default (unset) -> prod" {
+  unset CLIENT_ENV  # setup() sets it to "", so unset explicitly to cover the truly-unset case
+  run _backend_url
+  [ "$output" = "https://api.tracebloc.io/" ]
+}
+
+@test "_backend_url: dev" {
+  CLIENT_ENV=dev
+  run _backend_url
+  [ "$output" = "https://dev-api.tracebloc.io/" ]
+}
+
+@test "_backend_url: stg" {
+  CLIENT_ENV=stg
+  run _backend_url
+  [ "$output" = "https://stg-api.tracebloc.io/" ]
+}
+
+@test "_backend_url: unknown -> prod" {
+  CLIENT_ENV=whatever
+  run _backend_url
+  [ "$output" = "https://api.tracebloc.io/" ]
+}
+
+# ── verify_credentials (mock curl's http_code on stdout) ───────────────────
+@test "verify_credentials: HTTP 200 -> valid" {
+  curl() { echo 200; }  # stub prints only the status code, standing in for curl's http_code output
+  run verify_credentials id pw
+  [ "$output" = valid ]
+}
+
+@test "verify_credentials: HTTP 400 -> invalid" {
+  curl() { echo 400; }
+  run verify_credentials id pw
+  [ "$output" = invalid ]
+}
+
+@test "verify_credentials: HTTP 401 -> inactive" {
+  curl() { echo 401; }
+  run verify_credentials id pw
+  [ "$output" = inactive ]
+}
+
+@test "verify_credentials: HTTP 429 -> unverified" {
+  curl() { echo 429; }
+  run verify_credentials id pw
+  [ "$output" = unverified ]
+}
+
+@test "verify_credentials: connection failure -> unverified" {
+  curl() { return 7; }  # non-zero exit and no output: the transport-failure case
+  run verify_credentials id pw
+  [ "$output" = unverified ]
+}
+
+# ── sanitizers ─────────────────────────────────────────────────────────────
+@test "_strip_paste_garbage: unwraps bracketed-paste ESC markers" {
+  run _strip_paste_garbage "$(printf '\e[200~secret\e[201~')"
+  [ "$output" = "secret" ]
+}
+
+@test "_strip_paste_garbage: strips C0 control chars, keeps text" {
+  run _strip_paste_garbage "$(printf 'ab\001cd')"
+  [ "$output" = "abcd" ]
+}
+
+@test "_sanitize_workspace_name: lowercases + dashes" {
+  run _sanitize_workspace_name "My Team_1"
+  [ "$output" = "my-team-1" ]
+}
+
+@test "_sanitize_workspace_name: all-invalid -> default" {
+  run _sanitize_workspace_name "@@@"
+  [ "$output" = "default" ]
+}
+
+@test "_sanitize_workspace_name: collapses + trims dashes" {
+  run _sanitize_workspace_name "a--b-"
+  [ "$output" = "a-b" ]
+}
+
+# ── _extract_yaml_value ────────────────────────────────────────────────────
+@test "_extract_yaml_value: double-quoted" {
+  f="$BATS_TEST_TMPDIR/v"; printf 'clientId: "abc-123"\n' >"$f"
+  run _extract_yaml_value "$f" clientId
+  [ "$output" = "abc-123" ]
+}
+
+@test "_extract_yaml_value: single-quoted with '' escape" {
+  f="$BATS_TEST_TMPDIR/v"; printf "clientPassword: 'a''b'\n" >"$f"
+  run _extract_yaml_value "$f" clientPassword
+  [ "$output" = "a'b" ]  # the doubled '' in the file comes back as one literal quote
+}
+
+@test "_extract_yaml_value: missing key -> empty" {
+  f="$BATS_TEST_TMPDIR/v"; printf 'other: x\n' >"$f"
+  run _extract_yaml_value "$f" clientId
+  [ "$output" = "" ]
+}
+
+# ── _ensure_helm_runnable (happy path) ─────────────────────────────────────
+@test "_ensure_helm_runnable: helm runs -> ok" {
+  helm() { return 0; }
+  run _ensure_helm_runnable
+  [ "$status" -eq 0 ]
+}
+
+# ── install_client_helm: full flow with mocks ──────────────────────────────
+@test "install_client_helm: valid creds -> writes values.yaml + runs helm" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  _ensure_tracebloc_dirs() { :; }  # the three _ensure_* helpers are replaced by no-ops for the flow tests
+  _ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }  # log the invocation instead of running helm
+  verify_credentials() { printf valid; }
+  run install_client_helm <<< $'myws\nmyid\nmypw'  # stdin answers in prompt order: workspace, client id, password
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Credentials verified"* ]]
+  [[ "$output" == *"Connected to tracebloc"* ]]
+  grep -q 'clientId: "myid"' "$HOST_DATA_DIR/values.yaml"
+  grep -q "clientPassword: 'mypw'" "$HOST_DATA_DIR/values.yaml"
+  mock_calls | grep -q "helm upgrade --install myws"
+}
+
+@test "install_client_helm: re-prompts on invalid, then accepts valid" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  _ensure_tracebloc_dirs() { :; }
+  _ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }
+  verify_credentials() {
+    local n; n=$(cat "$BATS_TEST_TMPDIR/n" 2>/dev/null || echo 0); n=$((n+1)); echo "$n" >"$BATS_TEST_TMPDIR/n"  # call counter kept in a file — presumably because the caller runs this in a subshell; confirm
+    if [ "$n" -ge 2 ]; then printf valid; else printf invalid; fi  # first call rejects, every later call accepts
+  }
+  run install_client_helm <<< $'myws\nbadid\nbadpw\ngoodid\ngoodpw'
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"rejected"* ]]
+  [[ "$output" == *"Credentials verified"* ]]
+  grep -q 'clientId: "goodid"' "$HOST_DATA_DIR/values.yaml"
+}
+
+@test "install_client_helm: inactive account -> errors, no helm install" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  _ensure_tracebloc_dirs() { :; }
+  _ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }
+  verify_credentials() { printf inactive; }
+  run install_client_helm <<< $'myws\nmyid\nmypw'
+  [ "$status" -ne 0 ]
+  [[ "$output" == *"not active"* ]]
+  run mock_calls
+  [[ "$output" != *"helm upgrade"* ]]  # the helm mock must never have been reached
+}
+
+@test "install_client_helm: unverified backend -> proceeds with install" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  _ensure_tracebloc_dirs() { :; }
+  _ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }
+  verify_credentials() { printf unverified; }
+  run install_client_helm <<< $'myws\nmyid\nmypw'
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Couldn't reach tracebloc"* ]]
+  run mock_calls
+  [[ "$output" == *"helm upgrade --install"* ]]
+}
+
+@test "install_client_helm: dev-mode uses caller values file, skips prompts" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  vf="$BATS_TEST_TMPDIR/v.yaml"; printf 'clientId: "x"\n' >"$vf"
+  _ensure_tracebloc_dirs() { :; }
+
_ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }
+  TRACEBLOC_VALUES_FILE="$vf"; TB_NAMESPACE=devns  # no stdin is supplied: the values file + namespace replace the prompts
+  run install_client_helm
+  [ "$status" -eq 0 ]
+  run mock_calls
+  [[ "$output" == *"helm upgrade --install devns"* ]]
+}
+
+@test "install_client_helm: reuses previous clientId/password defaults" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  printf 'clientId: "previd"\nclientPassword: '"'"'prevpw'"'"'\n' >"$HOST_DATA_DIR/values.yaml"
+  _ensure_tracebloc_dirs() { :; }
+  _ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }
+  verify_credentials() { printf valid; }
+  # use-previous=y, workspace=myws, ClientID=Enter(keep previd), password=Enter(keep prevpw)
+  run install_client_helm <<< $'y\nmyws\n\n\n'
+  [ "$status" -eq 0 ]
+  grep -q 'clientId: "previd"' "$HOST_DATA_DIR/values.yaml"
+  grep -q "clientPassword: 'prevpw'" "$HOST_DATA_DIR/values.yaml"
+}
+
+@test "install_client_helm: gives up after max failed attempts" {
+  HOST_DATA_DIR="$BATS_TEST_TMPDIR/data"; mkdir -p "$HOST_DATA_DIR"
+  _ensure_tracebloc_dirs() { :; }
+  _ensure_release_dirs() { :; }
+  _ensure_helm_runnable() { :; }
+  helm() { record "helm $*"; return 0; }
+  verify_credentials() { printf invalid; }
+  run install_client_helm <<< $'myws\ni1\np1\ni2\np2\ni3\np3\ni4\np4\ni5\np5'  # five id/password pairs, all rejected by the mock
+  [ "$status" -ne 0 ]
+  [[ "$output" == *"Too many failed attempts"* ]]
+  run mock_calls
+  [[ "$output" != *"helm upgrade"* ]]
+}
diff --git a/scripts/tests/install-k8s.Tests.ps1 b/scripts/tests/install-k8s.Tests.ps1
new file mode 100644
index 0000000..30877f7
--- /dev/null
+++ b/scripts/tests/install-k8s.Tests.ps1
@@ -0,0 +1,232 @@
+# Pester tests for scripts/install-k8s.ps1 (Windows installer).
+# Dot-sources the script with $env:TB_PESTER set so the admin gate + main() are
+# skipped and only the functions load. Run: Invoke-Pester scripts/tests/
+
+BeforeAll {
+    $env:TB_PESTER = "1"
+    . "$PSScriptRoot/../install-k8s.ps1"
+    # Stubs so Pester can mock external commands that the functions invoke.
+    function kubectl { }; function helm { }  # helm is stubbed too: the Install-ClientHelm tests Mock it, and Mock cannot target a command that does not exist
+}
+
+Describe "Get-BackendUrl" {
+    AfterEach { $env:CLIENT_ENV = $null }  # clear the variable so one case cannot leak into the next
+    It "defaults to prod when CLIENT_ENV is unset" {
+        $env:CLIENT_ENV = $null
+        Get-BackendUrl | Should -Be "https://api.tracebloc.io/"
+    }
+    It "dev" { $env:CLIENT_ENV = "dev"; Get-BackendUrl | Should -Be "https://dev-api.tracebloc.io/" }
+    It "stg" { $env:CLIENT_ENV = "stg"; Get-BackendUrl | Should -Be "https://stg-api.tracebloc.io/" }
+    It "unknown -> prod" { $env:CLIENT_ENV = "whatever"; Get-BackendUrl | Should -Be "https://api.tracebloc.io/" }
+}
+
+Describe "Test-Credentials" {
+    It "HTTP 200 -> valid" {
+        Mock Invoke-WebRequest { [pscustomobject]@{ StatusCode = 200 } }
+        Test-Credentials -ClientId x -ClientPassword y | Should -Be "valid"
+    }
+    It "HTTP 400 -> invalid" {
+        Mock Invoke-WebRequest {
+            $resp = [pscustomobject]@{ StatusCode = 400 }
+            $ex = [System.Exception]::new("400"); $ex | Add-Member -NotePropertyName Response -NotePropertyValue $resp  # exception carrying Response.StatusCode, standing in for an HTTP error response
+            throw $ex
+        }
+        Test-Credentials -ClientId x -ClientPassword y | Should -Be "invalid"
+    }
+    It "HTTP 401 -> inactive" {
+        Mock Invoke-WebRequest {
+            $resp = [pscustomobject]@{ StatusCode = 401 }
+            $ex = [System.Exception]::new("401"); $ex | Add-Member -NotePropertyName Response -NotePropertyValue $resp
+            throw $ex
+        }
+        Test-Credentials -ClientId x -ClientPassword y | Should -Be "inactive"
+    }
+    It "connection failure -> unverified" {
+        Mock Invoke-WebRequest { throw [System.Exception]::new("connection refused") }  # no Response property at all
+        Test-Credentials -ClientId x -ClientPassword y | Should -Be "unverified"
+    }
+    It "non-200 success -> unverified" {
+        Mock Invoke-WebRequest { [pscustomobject]@{ StatusCode = 204 } }
+        Test-Credentials -ClientId x -ClientPassword y | Should -Be "unverified"
+    }
+}
+
+Describe "Get-NotReadyState" {
+    It "jobs-manager auth error -> bad_creds" {
+        Mock kubectl { if ($args -match 'logs') { "Authentication failed: Unable to log in" } else { "" } }  # "logs" calls return the auth error; every other kubectl call returns nothing
+        Get-NotReadyState -Namespace ns | Should -Be "bad_creds"
+    }
+    It "ImagePullBackOff -> image_pull" {
+        Mock kubectl { if ($args -match 'logs') { "booting" } else { "x 0/1 ImagePullBackOff" } }
+        Get-NotReadyState -Namespace ns | Should -Be "image_pull"
+    }
+    It "CrashLoopBackOff -> crash" {
+        Mock kubectl { if ($args -match 'logs') { "booting" } else { "x 0/1 CrashLoopBackOff" } }
+        Get-NotReadyState -Namespace ns | Should -Be "crash"
+    }
+    It "still creating -> starting" {
+        Mock kubectl { if ($args -match 'logs') { "booting" } else { "x 0/1 ContainerCreating" } }
+        Get-NotReadyState -Namespace ns | Should -Be "starting"
+    }
+}
+
+Describe "Print-Summary" {
+    BeforeEach { $script:TB_NAMESPACE = "ns"; $GPU_VENDOR = "none"; $NVIDIA_DRIVER_OK = $false }
+    It "connected: Connected + trust claim" {
+        $script:ClientState = "connected"
+        $out = Print-Summary 6>&1 | Out-String  # 6>&1 folds the information stream into the pipeline so the text can be matched
+        $out | Should -Match "Connected to tracebloc"
+        $out | Should -Match "data never leaves"
+    }
+    It "starting: still starting, no trust claim" {
+        $script:ClientState = "starting"
+        $out = Print-Summary 6>&1 | Out-String
+        $out | Should -Match "still starting"
+        $out | Should -Not -Match "data never leaves"
+    }
+    It "bad_creds: rejected, no trust claim" {
+        $script:ClientState = "bad_creds"
+        $out = Print-Summary 6>&1 | Out-String
+        $out | Should -Match "rejected"
+        $out | Should -Not -Match "data never leaves"
+    }
+    It "crash: crash-loop message" {
+        $script:ClientState = "crash"
+        $out = Print-Summary 6>&1 | Out-String
+        $out | Should -Match "crash loop"
+    }
+}
+
+Describe "ConvertTo-WorkspaceName" {
+    It "lowercases + dashes spaces/underscores" { ConvertTo-WorkspaceName -Input_ "My Team_1" | Should -Be "my-team-1" }
+    It "all-invalid -> default" { ConvertTo-WorkspaceName -Input_ "@@@" | Should -Be "default" }
+}
+
+Describe "Get-WindowsArch" {
+    AfterEach { $env:PROCESSOR_ARCHITECTURE = "AMD64" }  # NOTE(review): resets to a fixed "AMD64", not to whatever value was set before the test
+    It "AMD64 -> amd64" { $env:PROCESSOR_ARCHITECTURE = "AMD64"; Get-WindowsArch | Should -Be "amd64" }
+    It "ARM64 -> arm64" { $env:PROCESSOR_ARCHITECTURE = "ARM64"; Get-WindowsArch | Should -Be "arm64" }
+    It "unknown -> Err" {
+        Mock Err { throw "err" }  # turn the script's Err helper into a catchable throw
+        $env:PROCESSOR_ARCHITECTURE = "sparc"
+        { Get-WindowsArch } | Should -Throw
+    }
+}
+
+Describe "Confirm-Config" {
+    It "valid config passes + sets HOST_DATA_DIR" {
+        $env:USERPROFILE = $env:HOME  # NOTE(review): never restored afterwards
+        $CLUSTER_NAME = "tracebloc"; $SERVERS = "1"; $AGENTS = "1"; $HOST_DATA_DIR = "$env:HOME/.tracebloc"
+        { Confirm-Config } | Should -Not -Throw
+    }
+    It "invalid CLUSTER_NAME -> Err" {
+        Mock Err { throw "err" }
+        $env:USERPROFILE = $env:HOME
+        $CLUSTER_NAME = "1bad"; $SERVERS = "1"; $AGENTS = "1"; $HOST_DATA_DIR = "$env:HOME/x"
+        { Confirm-Config } | Should -Throw
+    }
+}
+
+Describe "Wait-ForClientReady" {
+    BeforeEach { $script:TB_NAMESPACE = "ns"; $ReadyTimeout = "20" }
+    It "all rollouts ready -> connected" {
+        Mock kubectl { $global:LASTEXITCODE = 0 }  # every kubectl call reports success
+        Mock Confirm-Cluster { }
+        Wait-ForClientReady
+        $script:ClientState | Should -Be "connected"
+    }
+    It "a rollout fails -> diagnosed (bad_creds)" {
+        Mock kubectl {
+            if ($args -match 'rollout') { $global:LASTEXITCODE = 1; return }  # rollout calls fail
+            $global:LASTEXITCODE = 0
+            if ($args -match 'logs') { return "Authentication failed: Unable to log in" }
+            return "x 0/1 CrashLoopBackOff"
+        }
+        Mock Confirm-Cluster { }
+        Wait-ForClientReady
+        $script:ClientState | Should -Be "bad_creds"
+    }
+}
+
+Describe "Install-ClientHelm" {
+    BeforeEach {
+        $GPU_VENDOR = "none"; $NVIDIA_DRIVER_OK = $false; $env:CLIENT_ENV = $null
+        Mock helm { $global:LASTEXITCODE = 0 }
+    }
+    It "valid creds: writes values.yaml + runs helm" {
+        $HOST_DATA_DIR = "$TestDrive/d1"
+        Mock Read-Host {
+            param([string]$Prompt, [switch]$AsSecureString)
+            if ($Prompt -match 'password') { return (ConvertTo-SecureString "mypw" -AsPlainText -Force) }  # answers are chosen by matching the prompt text
+            if ($Prompt -match 'Workspace') { return "myws" }
+            if ($Prompt -match 'Client ID') { return "myid" }
+            return ""
+        }
+        Mock Test-Credentials { "valid" }
+        Install-ClientHelm
+        (Get-Content "$HOST_DATA_DIR/values.yaml" -Raw) | Should -Match 'clientId: "myid"'
+        # NB: the SecureString->plaintext path runs, but PtrToStringAuto only decodes
+        # correctly on Windows; assert the key is written, not the macOS-decoded value.
+        (Get-Content "$HOST_DATA_DIR/values.yaml" -Raw) | Should -Match "clientPassword:"
+        Should -Invoke helm -ParameterFilter { $args -contains "upgrade" }
+    }
+    It "CLIENT_ENV=dev is written into the values" {
+        $HOST_DATA_DIR = "$TestDrive/d1b"; $CLIENT_ENV = "dev"
+        Mock Read-Host {
+            param([string]$Prompt, [switch]$AsSecureString)
+            if ($Prompt -match 'password') { return (ConvertTo-SecureString "pw" -AsPlainText -Force) }
+            if ($Prompt -match 'Workspace') { return "ws" }
+            return "id"
+        }
+        Mock Test-Credentials { "valid" }
+        Install-ClientHelm
+        (Get-Content "$HOST_DATA_DIR/values.yaml" -Raw) | Should -Match 'CLIENT_ENV: dev'
+    }
+    It "re-prompts on invalid, then accepts valid" {
+        $HOST_DATA_DIR = "$TestDrive/d2"; $script:vc = 0
+        Mock Read-Host {
+            param([string]$Prompt, [switch]$AsSecureString)
+            if ($Prompt -match 'password') { return (ConvertTo-SecureString "pw" -AsPlainText -Force) }
+            if ($Prompt -match 'Workspace') { return "ws" }
+            return "id"
+        }
+        Mock Test-Credentials { $script:vc++; if ($script:vc -ge 2) { "valid" } else { "invalid" } }  # first call rejects, second accepts
+        Install-ClientHelm
+        Should -Invoke Test-Credentials -Times 2
+        Should -Invoke helm -ParameterFilter { $args -contains "upgrade" }
+    }
+    It "unverified backend -> proceeds with install" {
+        $HOST_DATA_DIR = "$TestDrive/d3"
+        Mock Read-Host {
+            param([string]$Prompt, [switch]$AsSecureString)
+            if ($Prompt -match 'password') { return (ConvertTo-SecureString "pw" -AsPlainText -Force) }
+            if ($Prompt -match 'Workspace') { return "ws" }
+            return "id"
+        }
+        Mock Test-Credentials { "unverified" }
+        Install-ClientHelm
+        Should -Invoke helm -ParameterFilter { $args -contains "upgrade" }
+    }
+    It "reuses
previous clientId/password defaults" {
+        $HOST_DATA_DIR = "$TestDrive/d4"; New-Item -ItemType Directory -Path $HOST_DATA_DIR -Force | Out-Null
+        Set-Content "$HOST_DATA_DIR/values.yaml" "clientId: `"previd`"`nclientPassword: 'prevpw'"
+        Mock Read-Host {
+            param([string]$Prompt, [switch]$AsSecureString)
+            if ($Prompt -match 'previous') { return "y" }
+            if ($Prompt -match 'password') { return (ConvertTo-SecureString "newpw" -AsPlainText -Force) }
+            if ($Prompt -match 'Workspace') { return "ws" }
+            return "" # Client ID -> Enter keeps the previous default (previd)
+        }
+        Mock Test-Credentials { "valid" }
+        Install-ClientHelm
+        (Get-Content "$HOST_DATA_DIR/values.yaml" -Raw) | Should -Match 'clientId: "previd"'
+    }
+}
+
+Describe "Confirm-Cluster" {
+    It "dumps cluster status without error" {
+        $script:TB_NAMESPACE = "ns"; $script:LOG_FILE = "$TestDrive/log.txt"
+        Mock kubectl { "info" }
+        { Confirm-Cluster } | Should -Not -Throw
+    }
+}
diff --git a/scripts/tests/setup-linux.bats b/scripts/tests/setup-linux.bats
new file mode 100644
index 0000000..86b6548
--- /dev/null
+++ b/scripts/tests/setup-linux.bats
@@ -0,0 +1,137 @@
+#!/usr/bin/env bats
+# Tests for scripts/lib/setup-linux.sh — RHEL Docker (#719), k3d secure_path
+# (#718), conntrack package name (#720), package-manager detection.
+load test_helper
+
+setup() {
+  load_lib setup-linux.sh  # sources common.sh, then the lib under test
+  MOCK_CALLS="$BATS_TEST_TMPDIR/mock_calls"; : >"$MOCK_CALLS"  # per-test call log in bats' auto-removed temp dir (a bare mktemp here leaked one file per test)
+  PRESENT_CMDS="curl conntrack"  # space-separated list of commands the has() mock reports as installed
+  TEST_DISTRO=ubuntu
+  USER=testuser
+  PM_UPDATE="pmupdate"; PM_INSTALL="pminstall"
+
+  has() { case " $PRESENT_CMDS " in *" $1 "*) return 0 ;; *) return 1 ;; esac; }  # whole-word lookup in PRESENT_CMDS
+  spin_cmd() { record "$*"; return 0; }  # log the arguments instead of running anything
+  sudo() { record "sudo $*"; return 0; }
+  systemctl() { return 0; }
+  usermod() { return 0; }
+  docker() { return 0; } # `docker info` succeeds → skip the sg-docker re-exec
+  id() { echo "testuser docker"; }
+  curl() { record "curl $*"; return 0; }
+  # Simulate /etc/os-release matching per TEST_DISTRO; delegate other greps.
+  grep() {
+    if [[ "$*" == *"/etc/os-release"* ]]; then
+      case "$TEST_DISTRO" in
+        amzn) [[ "$*" == *amzn* ]] ;;
+        alma) [[ "$*" == *almalinux* ]] ;;
+        *) return 1 ;;
+      esac
+      return  # bare return: propagates the status of the [[ ]] match above
+    fi
+    command grep "$@"  # `command` bypasses this function and runs the real grep
+  }
+}
+
+# ── setup_pm ───────────────────────────────────────────────────────────────
+@test "setup_pm: apt-get detected" {
+  PRESENT_CMDS="apt-get"
+  setup_pm  # called without `run` so the PM_INSTALL it sets is visible below
+  [[ "$PM_INSTALL" == *"apt-get install"* ]]
+}
+@test "setup_pm: dnf detected" {
+  PRESENT_CMDS="dnf"
+  setup_pm
+  [[ "$PM_INSTALL" == *"dnf install"* ]]
+}
+@test "setup_pm: none -> error" {
+  PRESENT_CMDS=""
+  run setup_pm
+  [ "$status" -ne 0 ]
+  [[ "$output" == *"No supported package manager"* ]]
+}
+
+# ── install_system_deps: conntrack package name (#720) ─────────────────────
+@test "install_system_deps: apt uses 'conntrack'" {
+  PRESENT_CMDS="apt-get curl" # apt present, conntrack binary absent
+  run install_system_deps
+  run mock_calls  # second `run` replaces $output with the recorded mock calls
+  [[ "$output" == *"conntrack"* ]]
+  [[ "$output" != *"conntrack-tools"* ]]
+}
+@test "install_system_deps: dnf uses 'conntrack-tools'" {
+  PRESENT_CMDS="dnf curl" # no apt-get, conntrack binary absent
+  run install_system_deps
+  run mock_calls
+  [[ "$output" == *"conntrack-tools"* ]]
+}
+@test "install_system_deps: conntrack present -> not installed" {
+  PRESENT_CMDS="apt-get curl conntrack"
+  run install_system_deps
+  run mock_calls
+  [[ "$output" != *"Installing conntrack"* ]]
+}
+
+# ── install_docker_engine: branch selection ────────────────────────────────
+@test "install_docker_engine: Amazon Linux -> dnf docker" {
+  PRESENT_CMDS="dnf"; TEST_DISTRO=amzn
+  run install_docker_engine
+  run mock_calls
+  [[ "$output" == *"dnf install -y docker"* ]]
+}
+@test "install_docker_engine: Arch -> pacman docker" {
+  PRESENT_CMDS="pacman"; TEST_DISTRO=ubuntu
+  run install_docker_engine
+  run mock_calls
+  [[ "$output" == *"pacman -S --noconfirm docker"* ]]
+}
+@test "install_docker_engine: SUSE -> zypper docker" {
+  PRESENT_CMDS="zypper"; TEST_DISTRO=ubuntu
+  run
install_docker_engine
+  run mock_calls
+  [[ "$output" == *"zypper install -y docker"* ]]
+}
+@test "install_docker_engine: RHEL clone (#719) -> docker-ce dnf repo" {
+  PRESENT_CMDS=""; TEST_DISTRO=alma  # no package manager listed: the branch is picked through the os-release grep mock
+  run install_docker_engine
+  run mock_calls
+  [[ "$output" == *"docker-ce.repo"* ]]
+  [[ "$output" == *"docker-ce docker-ce-cli containerd.io"* ]]
+}
+@test "install_docker_engine: Debian/Ubuntu -> get.docker.com" {
+  PRESENT_CMDS="curl"; TEST_DISTRO=ubuntu
+  run install_docker_engine
+  run mock_calls
+  [[ "$output" == *"get.docker.com"* ]]
+}
+@test "install_docker_engine: docker already present -> no install" {
+  PRESENT_CMDS="docker"; TEST_DISTRO=ubuntu
+  run install_docker_engine
+  run mock_calls
+  [[ "$output" != *"get.docker.com"* ]]
+  [[ "$output" != *"docker-ce.repo"* ]]
+}
+
+# ── install_k3d: PATH preserved through sudo (#718) ────────────────────────
+@test "install_k3d: installs via 'sudo env PATH=' (#718)" {
+  PRESENT_CMDS="curl"
+  has() {
+    if [ "$1" = k3d ]; then [ -f "$BATS_TEST_TMPDIR/k3di" ]  # k3d counts as installed only once the marker file exists
+    else case " $PRESENT_CMDS " in *" $1 "*) return 0 ;; *) return 1 ;; esac; fi
+  }
+  spin_cmd() { record "$*"; touch "$BATS_TEST_TMPDIR/k3di"; return 0; }  # the mocked install step creates the marker
+  run install_k3d
+  [ "$status" -eq 0 ]
+  run mock_calls
+  [[ "$output" == *"sudo env"* ]]
+  [[ "$output" == *"PATH="* ]]
+  [[ "$output" == *"bash"* ]]
+}
+@test "install_k3d: already present -> skip" {
+  has() { [ "$1" = k3d ]; }
+  spin_cmd() { record "$*"; return 0; }
+  run install_k3d
+  [ "$status" -eq 0 ]
+  run mock_calls
+  [ -z "$output" ]  # nothing recorded: no install command was issued
+}
diff --git a/scripts/tests/summary.bats b/scripts/tests/summary.bats
new file mode 100644
index 0000000..e2f4e47
--- /dev/null
+++ b/scripts/tests/summary.bats
@@ -0,0 +1,93 @@
+#!/usr/bin/env bats
+# Tests for scripts/lib/summary.sh — readiness gate + state-branched summary (#716)
+load test_helper
+
+setup() {
+  load_lib summary.sh  # sources common.sh, then the lib under test
+  TB_NAMESPACE=testns
+  GPU_VENDOR=none
+}
+
+# ── _diagnose_not_ready ────────────────────────────────────────────────────
+@test "_diagnose_not_ready: jobs-manager auth error -> bad_creds" {
+  kubectl() { case "$*" in *logs*) echo "Exception: Authentication failed: Unable to log in with provided credentials";; *) echo "x 0/2 CrashLoopBackOff";; esac; }  # "logs" calls get the auth error; every other call gets a pod-status line
+  run _diagnose_not_ready testns
+  [ "$output" = "bad_creds" ]
+}
+
+@test "_diagnose_not_ready: ImagePullBackOff -> image_pull" {
+  kubectl() { case "$*" in *logs*) echo "booting";; *) echo "x 0/1 ImagePullBackOff";; esac; }
+  run _diagnose_not_ready testns
+  [ "$output" = "image_pull" ]
+}
+
+@test "_diagnose_not_ready: CrashLoopBackOff (no auth err) -> crash" {
+  kubectl() { case "$*" in *logs*) echo "booting";; *) echo "x 0/1 CrashLoopBackOff";; esac; }
+  run _diagnose_not_ready testns
+  [ "$output" = "crash" ]
+}
+
+@test "_diagnose_not_ready: still creating -> starting" {
+  kubectl() { case "$*" in *logs*) echo "booting";; *) echo "x 0/1 ContainerCreating";; esac; }
+  run _diagnose_not_ready testns
+  [ "$output" = "starting" ]
+}
+
+# ── wait_for_client_ready ──────────────────────────────────────────────────
+@test "wait_for_client_ready: all rollouts succeed -> connected" {
+  kubectl() { case "$*" in *"rollout status"*) return 0;; *) echo "";; esac; }
+  READY_TIMEOUT=20
+  CLIENT_STATE=""
+  wait_for_client_ready  # called without `run` so the CLIENT_STATE it sets is visible to the assertion
+  [ "$CLIENT_STATE" = "connected" ]
+}
+
+@test "wait_for_client_ready: a rollout fails -> diagnosed (bad_creds)" {
+  kubectl() {
+    case "$*" in
+      *"rollout status"*) return 1 ;;
+      *logs*) echo "Authentication failed: Unable to log in" ;;
+      *) echo "x 0/2 CrashLoopBackOff" ;;
+    esac
+  }
+  READY_TIMEOUT=20
+  CLIENT_STATE=""
+  wait_for_client_ready
+  [ "$CLIENT_STATE" = "bad_creds" ]
+}
+
+# ── print_summary: the trust claim must appear ONLY when connected ─────────
+@test "print_summary connected: Connected + trust claim" {
+  CLIENT_STATE=connected
+  run print_summary  # `run` captures stdout and stderr together in $output
+  [[ "$output" == *"Connected to tracebloc"* ]]
+  [[ "$output" == *"data never leaves"* ]]
+}
+
+@test "print_summary starting: 'still starting', no trust claim" {
+  CLIENT_STATE=starting
+  run print_summary
+  [[ "$output" == *"still starting"* ]]
+  [[ "$output" != *"data never leaves"* ]]
+}
+
+@test "print_summary bad_creds: 'rejected', no trust claim" {
+  CLIENT_STATE=bad_creds
+  run print_summary
+  [[ "$output" == *"rejected"* ]]
+  [[ "$output" != *"data never leaves"* ]]
+}
+
+@test "print_summary image_pull: image message, no trust claim" {
+  CLIENT_STATE=image_pull
+  run print_summary
+  [[ "$output" == *"image couldn't be pulled"* ]]
+  [[ "$output" != *"data never leaves"* ]]
+}
+
+@test "print_summary crash: crash-loop message" {
+  CLIENT_STATE=crash
+  run print_summary
+  [[ "$output" == *"crash loop"* ]]
+  [[ "$output" != *"data never leaves"* ]]
+}
diff --git a/scripts/tests/test_helper.bash b/scripts/tests/test_helper.bash
new file mode 100644
index 0000000..0d1cc2f
--- /dev/null
+++ b/scripts/tests/test_helper.bash
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Shared helpers + mock scaffolding for the installer bats suite.
+# The installer libs are side-effect-safe to `source` (no top-level install
+# logic); only install-k8s.sh runs main(), which the tests never source.
+
+LIB_DIR="${BATS_TEST_DIRNAME}/../lib"  # resolved relative to the directory of the running .bats file
+SCRIPTS_DIR="${BATS_TEST_DIRNAME}/.."  # scripts/ root; not referenced by the helpers in this file
+
+# Source common.sh (logging helpers, colours, has/retry) + an optional target lib.
+load_lib() {
+  # shellcheck source=/dev/null
+  source "${LIB_DIR}/common.sh"
+  if [ -n "${1:-}" ]; then  # $1, when given, is a file name inside LIB_DIR
+    # shellcheck source=/dev/null
+    source "${LIB_DIR}/$1"
+  fi
+  LOG_FILE=/dev/null # make log() a silent sink during tests
+}
+
+# Record a mock invocation (one line per call) for later assertions; mock_calls prints what was recorded.
+record() { printf '%s\n' "$*" >>"${MOCK_CALLS:-/dev/null}"; }  # with MOCK_CALLS unset the line is discarded
+mock_calls() { cat "${MOCK_CALLS:-/dev/null}" 2>/dev/null; }  # prints nothing when MOCK_CALLS is unset or the file is unreadable