Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 117 additions & 25 deletions scripts/install-k8s.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -354,22 +354,36 @@ function Enable-VirtualisationFeatures {
Log "Updating WSL..."
$wslJob = Start-Job -ScriptBlock { cmd /c "wsl --update 2>&1" }
Write-Host -NoNewline " "
while ($wslJob.State -eq "Running") {
$wslTimeoutSec = 90
$wslElapsed = 0
while ($wslJob.State -eq "Running" -and $wslElapsed -lt $wslTimeoutSec) {
Write-Host -NoNewline "." -ForegroundColor DarkGray
Start-Sleep -Seconds 2
$wslElapsed += 2
}
Write-Host ""
$wslUpdate = Receive-Job -Job $wslJob
$wslExitOk = $wslJob.State -eq "Completed"
if ($wslJob.State -eq "Running") {
Stop-Job $wslJob
Log "WSL update timed out after ${wslTimeoutSec}s -- skipping."
Warn "WSL update is taking too long. Skipping for now."
Hint "Run 'wsl --update' manually after installation."
} else {
$wslUpdate = Receive-Job -Job $wslJob
$wslExitOk = $wslJob.State -eq "Completed"
if (-not $wslExitOk) { Log "WSL update may not have completed cleanly." }
}
Remove-Job -Job $wslJob -Force
if (-not $wslExitOk) { Log "WSL update may not have completed cleanly." }

$wslSet = cmd /c "wsl --set-default-version 2 2>&1"
if ($LASTEXITCODE -eq 0) {
$wslSetJob = Start-Job -ScriptBlock { cmd /c "wsl --set-default-version 2 2>&1" }
$wslSetDone = $wslSetJob | Wait-Job -Timeout 20
if ($wslSetDone) {
Receive-Job $wslSetJob | Out-Null
Remove-Job $wslSetJob -Force
Log "WSL2 set as default."
} else {
Stop-Job $wslSetJob; Remove-Job $wslSetJob -Force
Warn "Could not set WSL2 as default."
Hint "Try running 'wsl --update' manually, then re-run this script."
Hint "Try running 'wsl --set-default-version 2' manually."
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WSL set-default-version success check ignores command exit code

Medium Severity

The old code verified that `wsl --set-default-version 2` succeeded by checking `$LASTEXITCODE -eq 0`. The new code only checks whether the background job finished within the timeout via `Wait-Job`, but `Wait-Job` returns the job object regardless of whether the command inside exited with a non-zero code — the job state is "Completed" either way. So if the WSL command fails, the script still logs "WSL2 set as default." and the user never sees the warning or the hint to run it manually.

Fix in Cursor Fix in Web

}
}

Expand Down Expand Up @@ -469,10 +483,23 @@ function Install-NvidiaContainerToolkit {

Log "Setting up NVIDIA container toolkit in WSL2"

$prevEncoding = [Console]::OutputEncoding
[Console]::OutputEncoding = [System.Text.Encoding]::Unicode
$distroRaw = wsl --list --quiet 2>$null
[Console]::OutputEncoding = $prevEncoding
$wslListJob = Start-Job -ScriptBlock {
$prevEncoding = [Console]::OutputEncoding
[Console]::OutputEncoding = [System.Text.Encoding]::Unicode
$raw = wsl --list --quiet 2>$null
[Console]::OutputEncoding = $prevEncoding
return $raw
}
$wslListDone = $wslListJob | Wait-Job -Timeout 30
if (-not $wslListDone) {
Stop-Job $wslListJob; Remove-Job $wslListJob -Force
Warn "WSL did not respond in time. Skipping GPU container toolkit."
Hint "Run 'wsl --update' manually, then re-run this script for GPU support."
return
}
$distroRaw = Receive-Job $wslListJob
Remove-Job $wslListJob -Force

$distros = @($distroRaw | ForEach-Object { "$_".Trim() } | Where-Object { $_ -ne '' -and $_ -match '^\w' })
$wslDistro = ($distros | Where-Object { $_ -match 'Ubuntu' } | Select-Object -First 1)
if (-not $wslDistro -and $distros.Count -gt 0) { $wslDistro = $distros[0] }
Expand All @@ -481,9 +508,10 @@ function Install-NvidiaContainerToolkit {
Log "No WSL2 distro found -- installing Ubuntu..."
cmd /c "wsl --install -d Ubuntu --no-launch 2>&1" | Out-Null
cmd /c "wsl --setdefault Ubuntu 2>&1" | Out-Null
Warn "Ubuntu WSL2 installed."
Hint "Complete first-run setup: open Ubuntu from Start Menu, set a username/password."
Err "Please complete WSL2 Ubuntu setup first, then re-run."
Warn "Ubuntu WSL2 installed but needs first-run setup."
Hint "Open Ubuntu from the Start Menu and set a username/password."
Hint "Then re-run this script for GPU support."
return
}

Log "Using WSL2 distro: $wslDistro"
Expand All @@ -508,14 +536,40 @@ echo "NCT installed successfully."
[System.IO.File]::WriteAllText($scriptPath, $nctScript.Replace("`r`n", "`n"))
$wslPath = "/mnt/" + ($scriptPath -replace '\\','/' -replace '^([A-Za-z]):/', { $_.Groups[1].Value.ToLower() + '/' })

cmd /c "wsl -d $wslDistro -- /bin/bash `"$wslPath`" 2>&1"
$nctInstallJob = Start-Job -ScriptBlock {
param($d, $p)
cmd /c "wsl -d $d -- /bin/bash `"$p`" 2>&1"
} -ArgumentList $wslDistro, $wslPath

$nctDone = $nctInstallJob | Wait-Job -Timeout 180
if (-not $nctDone) {
Stop-Job $nctInstallJob; Remove-Job $nctInstallJob -Force
Remove-Item $scriptPath -Force -ErrorAction SilentlyContinue
Warn "GPU container toolkit installation timed out."
Hint "You can set it up manually inside WSL later."
return
}
Receive-Job $nctInstallJob | Out-Null
Remove-Job $nctInstallJob -Force
Remove-Item $scriptPath -Force -ErrorAction SilentlyContinue

$nctVer = cmd /c "wsl -d $wslDistro -- nvidia-ctk --version 2>&1"
if ($LASTEXITCODE -eq 0) {
Log "NVIDIA Container Toolkit in WSL2: $nctVer"
$script:K3D_GPU_FLAG = "--gpus=all"
$verJob = Start-Job -ScriptBlock {
param($d)
cmd /c "wsl -d $d -- nvidia-ctk --version 2>&1"
} -ArgumentList $wslDistro

$verDone = $verJob | Wait-Job -Timeout 15
if ($verDone) {
$nctVer = (Receive-Job $verJob | Out-String).Trim()
Remove-Job $verJob -Force
if ($nctVer -and $nctVer -notmatch 'error|not found') {
Log "NVIDIA Container Toolkit in WSL2: $nctVer"
$script:K3D_GPU_FLAG = "--gpus=all"
} else {
Warn "GPU setup may need manual attention."
}
} else {
Stop-Job $verJob; Remove-Job $verJob -Force
Warn "GPU setup may need manual attention."
}
}
Expand Down Expand Up @@ -679,14 +733,51 @@ function New-K3dCluster {
}

Log "Creating cluster: $SERVERS server(s) + $AGENTS agent(s)..."
Hint "First run may take 1-2 minutes to download components."
Hint "First run may take a few minutes to download components."

$k3dExe = (Get-Command k3d -ErrorAction SilentlyContinue).Source
if (-not $k3dExe) { $k3dExe = "k3d" }
$k3dArgString = ($k3dArgs | ForEach-Object {
if ($_ -match '[\s@]') { "`"$_`"" } else { $_ }
}) -join " "
$k3dOutLog = Join-Path $env:TEMP "k3d-create-$(Get-Random).log"
$k3dErrLog = Join-Path $env:TEMP "k3d-create-err-$(Get-Random).log"

$k3dProc = Start-Process -FilePath $k3dExe -ArgumentList $k3dArgString `
-NoNewWindow -PassThru `
-RedirectStandardOutput $k3dOutLog `
-RedirectStandardError $k3dErrLog

& k3d $k3dArgs
if ($LASTEXITCODE -ne 0) { Err "Failed to create compute environment." }
$frames = @([char]0x2807, [char]0x2819, [char]0x2839, [char]0x2838, [char]0x283C, [char]0x2834, [char]0x2826, [char]0x2827, [char]0x2847, [char]0x280F)
$f = 0
Write-Host -NoNewline " "
while (-not $k3dProc.HasExited) {
Write-Host "`r " -NoNewline
Write-Host $frames[$f] -ForegroundColor Cyan -NoNewline
Write-Host " Creating compute environment..." -NoNewline
$f = ($f + 1) % $frames.Count
Start-Sleep -Seconds 2
}
Write-Host "`r `r" -NoNewline

$k3dExitCode = $k3dProc.ExitCode
$k3dStdout = if (Test-Path $k3dOutLog) { Get-Content $k3dOutLog -Raw -ErrorAction SilentlyContinue } else { "" }
$k3dStderr = if (Test-Path $k3dErrLog) { Get-Content $k3dErrLog -Raw -ErrorAction SilentlyContinue } else { "" }
Remove-Item $k3dOutLog, $k3dErrLog -Force -ErrorAction SilentlyContinue
if ($k3dStdout) { Log "k3d stdout: $k3dStdout" }
if ($k3dStderr) { Log "k3d stderr: $k3dStderr" }

if ($k3dExitCode -ne 0) { Err "Failed to create compute environment." }
Ok "Compute environment ready."
}

k3d kubeconfig merge $CLUSTER_NAME --kubeconfig-switch-context | Out-Null

$kubeConfigPath = "$env:USERPROFILE\.kube\config"
if (Test-Path $kubeConfigPath) {
(Get-Content $kubeConfigPath) -replace 'host\.docker\.internal', '127.0.0.1' | Set-Content $kubeConfigPath -Encoding UTF8
}

Log "kubeconfig updated -- kubectl now points to '$CLUSTER_NAME'."
}

Expand Down Expand Up @@ -911,11 +1002,12 @@ $envBlock

Write-Host ""
Log "Installing $TB_NAMESPACE from $TRACEBLOC_HELM_REPO_NAME/$TRACEBLOC_CHART_NAME in namespace '$TB_NAMESPACE'..."
$null = (helm upgrade --install $TB_NAMESPACE "$TRACEBLOC_HELM_REPO_NAME/$TRACEBLOC_CHART_NAME" `
$helmOutput = (helm upgrade --install $TB_NAMESPACE "$TRACEBLOC_HELM_REPO_NAME/$TRACEBLOC_CHART_NAME" `
--namespace $TB_NAMESPACE `
--create-namespace `
--values $valuesFile 2>&1)
if ($LASTEXITCODE -ne 0) { Err "Client installation failed. Check the log for details: $LOG_FILE" }
--values $valuesFile 2>&1) | Out-String
Log "Helm Output: $helmOutput"
if ($LASTEXITCODE -ne 0) { Err "Client installation failed. Helm output:`n$helmOutput`nCheck the log for details: $LOG_FILE" }

Ok "Connected to tracebloc"
Log "Values file: $valuesFile"
Expand Down
2 changes: 1 addition & 1 deletion scripts/install.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ for ($i = 1; $i -le $maxAttempts; $i++) {
}

try {
& $ScriptDest @args
powershell.exe -ExecutionPolicy Bypass -File $ScriptDest @args
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Child process exit code not propagated to parent

High Severity

Switching from `& $ScriptDest @args` (same session) to `powershell.exe -ExecutionPolicy Bypass -File $ScriptDest @args` (child process) means the installer's exit code is no longer propagated. The `install-k8s.ps1` script uses `exit 1` and `exit 2` for various failure paths (including the `Err` function), but after the child `powershell.exe` terminates, `$LASTEXITCODE` is never checked or forwarded. The parent always exits with code 0, silently masking installation failures.

Fix in Cursor Fix in Web

} finally {
Remove-Item $TmpDir -Recurse -Force -ErrorAction SilentlyContinue
}