Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ dist/
tmp/
session_manager/events/
certs/

# AI agent working directories
.sisyphus/
93 changes: 93 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ serde_yaml = { workspace = true }
rustix = { version = "1.1" , features = ["system"] }
num_cpus = "1.17"
bytesize = "1.3"
nvml-wrapper = "0.10"

axum = { workspace = true }
pprof = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions common/src/apis/from_rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ impl From<rpc::ResourceRequirement> for ResourceRequirement {
Self {
cpu: req.cpu,
memory: req.memory,
gpu: req.gpu,
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion common/src/apis/to_rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ impl From<ResourceRequirement> for rpc::ResourceRequirement {
Self {
cpu: req.cpu,
memory: req.memory,
gpu: 0,
gpu: req.gpu,
}
}
}
Expand Down Expand Up @@ -202,6 +202,7 @@ impl From<&Session> for rpc::Session {
max_instances: ssn.max_instances,
batch_size: ssn.batch_size,
priority: ssn.priority,
resreq: None,
}),
status: Some(status),
}
Expand Down
54 changes: 51 additions & 3 deletions common/src/apis/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ pub struct SessionAttributes {
pub max_instances: Option<u32>,
pub batch_size: u32,
pub priority: u32,
pub resreq: Option<ResourceRequirement>,
}

impl Default for SessionAttributes {
Expand All @@ -193,6 +194,7 @@ impl Default for SessionAttributes {
max_instances: None,
batch_size: 1,
priority: 0,
resreq: None,
}
}
}
Expand Down Expand Up @@ -226,6 +228,7 @@ pub struct Session {
pub max_instances: Option<u32>,
pub batch_size: u32,
pub priority: u32,
pub resreq: Option<ResourceRequirement>,
}

#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash, strum_macros::Display)]
Expand Down Expand Up @@ -351,6 +354,7 @@ pub struct NodeInfo {
/// A bundle of resource quantities (CPU, memory, GPU).
///
/// The same type describes what a node offers (`Node::refresh` builds the
/// node capacity from it) and what a workload asks for (it can be parsed from
/// a `key=value` list such as `cpu=..,memory=..,gpu=..`).
pub struct ResourceRequirement {
/// CPU quantity. `Node::refresh` fills this from `num_cpus::get()`.
pub cpu: u64,
/// Memory quantity. `Node::refresh` fills this from `sysinfo().totalram`;
/// presumably bytes — TODO confirm against `parse_memory`.
pub memory: u64,
/// Number of GPU devices. Signed because it is copied to and from the RPC
/// `gpu` field without conversion; `to_slots` treats a negative value as
/// zero available GPU slots.
pub gpu: i32,
}

#[derive(Clone, Debug, Default)]
Expand Down Expand Up @@ -382,6 +386,32 @@ fn totalram() -> u64 {
system::sysinfo().totalram
}

/// Returns the number of GPUs this node should advertise.
///
/// Detection order:
/// 1. If `CUDA_VISIBLE_DEVICES` is set, it is authoritative and NVML is not
///    queried. The value is interpreted by [`count_visible_devices`].
/// 2. Otherwise NVML is initialized and asked for its device count.
///
/// Any NVML failure is logged and reported as `0` GPUs rather than as an
/// error, so nodes without an NVIDIA driver keep working.
fn detect_gpus() -> i32 {
    if let Ok(devices) = std::env::var("CUDA_VISIBLE_DEVICES") {
        return count_visible_devices(&devices);
    }

    match nvml_wrapper::Nvml::init() {
        Ok(nvml) => match nvml.device_count() {
            Ok(count) => {
                tracing::info!("Detected {} GPU(s) via NVML", count);
                // Saturate instead of wrapping: a plain `as i32` would turn a
                // count above i32::MAX into a negative number.
                count.min(i32::MAX as u32) as i32
            }
            Err(e) => {
                tracing::warn!("NVML initialized but failed to get device count: {}", e);
                0
            }
        },
        Err(e) => {
            tracing::debug!("NVML not available, GPU detection skipped: {}", e);
            0
        }
    }
}

/// Counts the devices exposed by a `CUDA_VISIBLE_DEVICES` value.
///
/// Entries are comma-separated device indices or UUIDs. Following CUDA's
/// rule that everything from the first invalid entry onward is ignored,
/// counting stops at the first entry that is empty (after trimming) or is a
/// negative integer. Consequently `""` and `"-1"` yield `0`, `"0,1,"` yields
/// `2`, and `"0,-1,1"` yields `1`.
fn count_visible_devices(devices: &str) -> i32 {
    let visible = devices
        .split(',')
        .map(str::trim)
        .take_while(|entry| {
            // Non-numeric entries (e.g. `GPU-<uuid>`) are accepted as-is; only
            // blank entries and negative indices terminate the list.
            !entry.is_empty() && !matches!(entry.parse::<i64>(), Ok(idx) if idx < 0)
        })
        .count();

    // Saturating conversion; `usize as i32` would silently truncate.
    visible.min(i32::MAX as usize) as i32
}

impl Node {
pub fn new() -> Self {
let name = uname();
Expand All @@ -397,7 +427,8 @@ impl Node {
pub fn refresh(&mut self) {
let memory = totalram();
let cpu = num_cpus::get() as u64;
let capacity = ResourceRequirement { cpu, memory };
let gpu = detect_gpus();
let capacity = ResourceRequirement { cpu, memory, gpu };
let allocatable = capacity.clone();
let info = NodeInfo {
arch: env::consts::ARCH.to_string(),
Expand All @@ -420,6 +451,7 @@ impl From<&String> for ResourceRequirement {
let parts = s.split(',');
let mut cpu = 0;
let mut memory = 0;
let mut gpu = 0;
for p in parts {
let mut parts = p.split('=').map(|s| s.trim());
let key = parts.next();
Expand All @@ -428,12 +460,13 @@ impl From<&String> for ResourceRequirement {
(Some("cpu"), Some(value)) => cpu = value.parse::<u64>().unwrap_or(0),
(Some("memory"), Some(value)) => memory = Self::parse_memory(value),
(Some("mem"), Some(value)) => memory = Self::parse_memory(value),
(Some("gpu"), Some(value)) => gpu = value.parse::<i32>().unwrap_or(0),
_ => {
tracing::error!("Invalid resource requirement: {s}");
}
}
}
Self { cpu, memory }
Self { cpu, memory, gpu }
}
}

Expand All @@ -442,11 +475,26 @@ impl ResourceRequirement {
Self {
cpu: slots as u64 * unit.cpu,
memory: slots as u64 * unit.memory,
gpu: slots as i32 * unit.gpu,
}
}

pub fn to_slots(&self, unit: &ResourceRequirement) -> u32 {
(self.cpu / unit.cpu).min(self.memory / unit.memory) as u32
let cpu_slots = self.cpu.checked_div(unit.cpu).unwrap_or(u64::MAX);

let mem_slots = self.memory.checked_div(unit.memory).unwrap_or(u64::MAX);

let gpu_slots = if unit.gpu > 0 {
if self.gpu < 0 {
0
} else {
(self.gpu as u64) / (unit.gpu as u64)
}
} else {
u64::MAX
};
Comment on lines +487 to +495
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

There's a potential issue with integer casting if self.gpu is negative. If self.gpu is negative (e.g., -1) and unit.gpu is positive, self.gpu / unit.gpu will be negative. Casting a negative i32 to u64 results in a very large positive number due to two's complement representation. This would incorrectly suggest a huge number of available GPU slots, leading to incorrect scheduling decisions.

        let gpu_slots = if unit.gpu > 0 {
            if self.gpu < 0 {
                0
            } else {
                (self.gpu as u64) / (unit.gpu as u64)
            }
        } else {
            u64::MAX // No GPU requirement = unlimited GPU slots
        };


cpu_slots.min(mem_slots).min(gpu_slots) as u32
}

pub(crate) fn parse_memory(s: &str) -> u64 {
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.console
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rust:1.94 AS builder
FROM rust:1.95 AS builder

WORKDIR /usr/src/flame
COPY . .
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.fem
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rust:1.94 AS builder
FROM rust:1.95 AS builder

WORKDIR /usr/src/flame
COPY . .
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.foc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rust:1.94 AS builder
FROM rust:1.95 AS builder

WORKDIR /usr/src/flame
COPY . .
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.fsm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rust:1.94 AS builder
FROM rust:1.95 AS builder

WORKDIR /usr/src/flame
COPY . .
Expand Down
Loading
Loading