In [58]:
from cheeto.puppet import (parse_yaml_forest,
                           validate_yaml_forest,
                           PuppetAccountMap,
                           MergeStrategy,
                           SlurmQOS,
                           SlurmQOSTRES)
from cheeto.slurm import (SControl, 
                          sanitize_tres, 
                          build_puppet_tres)
from cheeto.utils import size_to_megs

from rich import print as pprint

from pathlib import Path

In [2]:
# Parse the merged account YAML of the farm.hpc.ucdavis.edu domain.
# NOTE(review): the path is relative to the notebook's working directory.
yamls = parse_yaml_forest(['../puppet.hpc-accounts/domains/farm.hpc.ucdavis.edu/merged/all.yaml'], 
                          merge_on=MergeStrategy.ALL)

In [3]:
# Validate the parsed YAML against PuppetAccountMap in strict mode and keep only
# the first pair the validator yields; the first element of that pair is discarded.
_, puppet_data = next(validate_yaml_forest(yamls, PuppetAccountMap, strict=True))

In [4]:
# Partition name -> record. The parsing cell stores 'nodes', 'tres_max' and
# 'tres_mapped' in each record; a later cell adds 'oversubscribe_factor'.
partitions = {}

In [34]:
# Walk the rows that SControl.get_scontrol_parser produces from the saved
# partition listing and store, for each partition, its node list, its total
# TRES and a zeroed tally that the QOS cell fills in later.
with open('farm.partitions.txt') as fp:
    for row in SControl.get_scontrol_parser(fp):
        name, nodes = row['PartitionName'], row['Nodes']
        san_tres = sanitize_tres(row['TRES'])
        # 'mem' is required; missing 'cpu' / 'gpu' entries count as 0.
        tres = dict(
            mem=size_to_megs(san_tres['mem']),
            cpus=int(san_tres.get('cpu', 0)),
            gpus=int(san_tres.get('gpu', 0)),
        )
        print(f'{name} ({nodes}): {tres}')
        partitions[name] = dict(
            nodes=nodes,
            tres_max=tres,
            tres_mapped=dict(cpus=0, mem=0, gpus=0),
        )

low2 (cpu-3-[50-57,62-69],cpu-4-[68-71,86-93],cpu-6-[58-77,86-97]): {'mem': 40960000, 'cpus': 8192, 'gpus': 0}
med2 (cpu-3-[50-57,62-69],cpu-4-[68-71,86-93],cpu-6-[58-77,86-97]): {'mem': 40960000, 'cpus': 8192, 'gpus': 0}
high2 (cpu-3-[50-57,62-69],cpu-4-[68-71,86-93],cpu-6-[58-77,86-97]): {'mem': 40960000, 'cpus': 8192, 'gpus': 0}
low (cpu-8-[62-77,86-96],cpu-9-[63-70,72-77,86-97],cpu-10-[15-18,66-77,86-97],cpu-11-[70-77,86-97]): {'mem': 12928000, 'cpus': 2424, 'gpus': 0}
med (cpu-8-[62-77,86-96],cpu-9-[63-70,72-77,86-97],cpu-10-[15-18,66-77,86-97],cpu-11-[70-77,86-97]): {'mem': 12928000, 'cpus': 2424, 'gpus': 0}
high (cpu-8-[62-77,86-96],cpu-9-[63-70,72-77,86-97],cpu-10-[15-18,66-77,86-97],cpu-11-[70-77,86-97]): {'mem': 12928000, 'cpus': 2424, 'gpus': 0}
bigmeml (bigmem[1-8,10]): {'mem': 9900000, 'cpus': 624, 'gpus': 0}
bigmemm (bigmem[1-8,10]): {'mem': 9900000, 'cpus': 624, 'gpus': 0}
bigmemh (bigmem[1-8]): {'mem': 7960000, 'cpus': 528, 'gpus': 0}
bigmemht (bigmem10): {'mem': 194000

In [35]:
# Sum the group-level QOS limits that puppet assigns on each Slurm partition
# into partitions[<name>]['tres_mapped'].
partition_qoses = {}  # NOTE(review): never filled in this cell; kept in case another cell reads it

# Zero every tally first so that re-running this cell cannot double count.
for part in partitions.values():
    part['tres_mapped'] = {'cpus': 0, 'mem': 0, 'gpus': 0}

for groupname, group in puppet_data.group.items():
    if not (group.slurm and group.slurm.partitions):
        continue
    for name, partition in group.slurm.partitions.items():
        qos = partition.qos
        # A qos can also be a plain string naming another QOS; only inline
        # SlurmQOS objects that define group limits contribute to the tally.
        if not isinstance(qos, SlurmQOS) or qos.group is None:
            continue
        # Explicit membership test: only a genuinely unknown partition is
        # reported, never an unrelated KeyError.
        if name not in partitions:
            print(f'Partition "{name}" in puppet but not Slurm.')
            continue
        qos_tres = qos.group
        part_tally = partitions[name]['tres_mapped']
        # Limits left unset (None) contribute nothing.
        part_tally['cpus'] += qos_tres.cpus if qos_tres.cpus is not None else 0
        part_tally['gpus'] += qos_tres.gpus if qos_tres.gpus is not None else 0
        part_tally['mem'] += size_to_megs(qos_tres.mem) if qos_tres.mem is not None else 0

Partition "ecl243h" in puppet but not Slurm.


In [39]:
# For each partition, divide the tallied TRES by the available TRES, resource by
# resource, and store the result on the record. Resources whose available
# amount is 0 are left out to avoid dividing by zero.
for partname, part in partitions.items():
    tres_mapped, tres_max = part['tres_mapped'], part['tres_max']
    ratios = dict(
        (k, tres_mapped[k] / v)
        for k, v in tres_max.items()
        if v != 0
    )
    part.update(oversubscribe_factor=ratios)

In [49]:
# Report only the partitions where at least one resource ratio exceeds 1.
for partname, part in partitions.items():
    if not any(factor > 1 for factor in part['oversubscribe_factor'].values()):
        continue
    print(f'{partname}:\n\tAvailable: {part["tres_max"]}\n\tOversubscription: {part["oversubscribe_factor"]}')

high2:
	Available: {'mem': 40960000, 'cpus': 8192, 'gpus': 0}
	Oversubscription: {'mem': 0.7138671875, 'cpus': 1.00390625}
high:
	Available: {'mem': 12928000, 'cpus': 2424, 'gpus': 0}
	Oversubscription: {'mem': 0.7326732673267327, 'cpus': 1.504950495049505}
bigmemh:
	Available: {'mem': 7960000, 'cpus': 528, 'gpus': 0}
	Oversubscription: {'mem': 0.628643216080402, 'cpus': 1.0909090909090908}
bigmemht:
	Available: {'mem': 1940000, 'cpus': 96, 'gpus': 0}
	Oversubscription: {'mem': 1.0309278350515463, 'cpus': 2.0}
bit150h:
	Available: {'mem': 1000000, 'cpus': 80, 'gpus': 0}
	Oversubscription: {'mem': 0.256, 'cpus': 1.2}
gpu-a100-h:
	Available: {'mem': 2048000, 'cpus': 128, 'gpus': 8}
	Oversubscription: {'mem': 0.5, 'cpus': 2.0, 'gpus': 1.0}


In [50]:
# Inspect the complete record stored for the 'high' partition.
partitions['high']

{'nodes': 'cpu-8-[62-77,86-96],cpu-9-[63-70,72-77,86-97],cpu-10-[15-18,66-77,86-97],cpu-11-[70-77,86-97]',
 'tres_max': {'mem': 12928000, 'cpus': 2424, 'gpus': 0},
 'tres_mapped': {'cpus': 3648, 'mem': 9472000, 'gpus': 0},
 'oversubscribe_factor': {'mem': 0.7326732673267327,
  'cpus': 1.504950495049505}}

In [51]:
# Display the whole validated account map.
# NOTE(review): the repr is very large; consider showing a single group or user instead.
puppet_data

PuppetAccountMap(group={'abontemp': PuppetGroupRecord(gid=20000, ensure=None, tag=None, storage=None, slurm=None), 'adamgrp': PuppetGroupRecord(gid=9998, ensure='present', tag=None, storage=[PuppetGroupStorage(name='adamroot', owner='root', group='root', autofs=PuppetAutofs(nas='nas-12-3-ib', path='/nas-12-3/adamgrp', options=None), zfs=PuppetZFS(quota='42T'))], slurm=SlurmRecord(account='adamgrp', partitions={'bml': SlurmPartition(qos=SlurmQOS(group=None, job=None, priority=0)), 'high': SlurmPartition(qos=SlurmQOS(group=SlurmQOSTRES(cpus=192, gpus=None, mem='512000M'), job=SlurmQOSTRES(cpus=192, gpus=None, mem=None), priority=0)), 'high2': SlurmPartition(qos=SlurmQOS(group=SlurmQOSTRES(cpus=352, gpus=None, mem='762000M'), job=None, priority=0)), 'low': SlurmPartition(qos='adamgrp-med-qos'), 'low2': SlurmPartition(qos='adamgrp-med2-qos'), 'med': SlurmPartition(qos=SlurmQOS(group=None, job=SlurmQOSTRES(cpus=192, gpus=None, mem=None), priority=10)), 'med2': SlurmPartition(qos=SlurmQOS(gr

In [53]:
# Show the groups recorded for the user 'camw'.
puppet_data.user['camw'].groups

{'conda-user',
 'ctbrowngrp',
 'hippo-user',
 'hpccfgrp',
 'software-user',
 'spack-user'}

In [55]:
# Show the Slurm record attached to the group 'ctbrowngrp'.
puppet_data.group['ctbrowngrp'].slurm

SlurmRecord(account='ctbrowngrp', partitions={'bmh': SlurmPartition(qos=SlurmQOS(group=SlurmQOSTRES(cpus=48, gpus=None, mem='512000M'), job=None, priority=0)), 'bml': SlurmPartition(qos=SlurmQOS(group=None, job=None, priority=10)), 'bmm': SlurmPartition(qos=SlurmQOS(group=None, job=None, priority=10)), 'gpu-a100-h': SlurmPartition(qos=SlurmQOS(group=SlurmQOSTRES(cpus=64, gpus=2, mem='256000M'), job=None, priority=0)), 'gpuh': SlurmPartition(qos=SlurmQOS(group=SlurmQOSTRES(cpus=6, gpus=1, mem='96000M'), job=None, priority=0)), 'gpum': SlurmPartition(qos='gpum-users-gpum-qos'), 'high2': SlurmPartition(qos=SlurmQOS(group=SlurmQOSTRES(cpus=224, gpus=None, mem='512000M'), job=None, priority=0)), 'low': SlurmPartition(qos='adamgrp-med-qos'), 'low2': SlurmPartition(qos=SlurmQOS(group=None, job=None, priority=20)), 'med': SlurmPartition(qos='adamgrp-med-qos'), 'med2': SlurmPartition(qos=SlurmQOS(group=None, job=None, priority=20))}, max_jobs=None)

In [56]:
def get_group_slurm_partitions(group: str):
    """Look up the Slurm account and partition names configured for a group.

    Reads the notebook-level ``puppet_data`` account map.

    Args:
        group: Key of the group in ``puppet_data.group``.

    Returns:
        A ``(account, partition_names)`` tuple. Both elements are ``None`` when
        the group is unknown or has no Slurm record. ``partition_names`` is an
        empty list when the Slurm record exists but defines no partitions.
    """
    try:
        slurm = puppet_data.group[group].slurm
        # Raises AttributeError when the group has no Slurm record (slurm is None).
        group_partitions = slurm.partitions
    except (KeyError, AttributeError):
        return None, None

    # A Slurm record may carry no partitions at all; report that as an empty
    # list instead of failing on ``None.keys()``.
    if group_partitions is None:
        return slurm.account, []
    return slurm.account, list(group_partitions)

In [106]:
def get_group_storages(group: str):
    """List the non-root storages of a group as ``(path, quota)`` pairs.

    Reads the notebook-level ``puppet_data`` account map.

    Args:
        group: Key of the group in ``puppet_data.group``.

    Returns:
        ``None`` when the group is unknown or has no storage list. Otherwise a
        list of ``(Path('/group') / name, quota)`` tuples, skipping every
        storage owned by ``'root'`` or whose name contains ``'root'``. ``quota``
        is ``None`` unless the storage has a ZFS object to read it from.
    """
    try:
        group_storages = puppet_data.group[group].storage
    except (KeyError, AttributeError):
        return None
    if group_storages is None:
        return None

    def _quota_of(zfs):
        # zfs may be unset or a plain boolean flag; neither carries a quota.
        return None if zfs in (None, True, False) else zfs.quota

    return [
        (Path('/group') / entry.name, _quota_of(entry.zfs))
        for entry in group_storages
        if entry.owner != 'root' and 'root' not in entry.name
    ]


In [97]:
# Spot-check get_group_storages on a single group.
get_group_storages('adamgrp')

[]

In [107]:
# Print each group name next to whatever get_group_storages returns for it.
for group in puppet_data.group:
    pprint(group, get_group_storages(group))

In [101]:
# Spot-check get_group_slurm_partitions on a single group.
get_group_slurm_partitions('adamgrp')

('adamgrp', ['bml', 'high', 'high2', 'low', 'low2', 'med', 'med2'])

In [60]:
# Path of the franklin.hpc.ucdavis.edu domain directory, relative to the working directory.
p = Path('../puppet.hpc-accounts/domains/franklin.hpc.ucdavis.edu/')

In [63]:
# The final path component is the domain name.
p.name

'franklin.hpc.ucdavis.edu'

In [110]:
# Show the raw storage entries recorded for the group 'ctbrowngrp'.
puppet_data.group['ctbrowngrp'].storage

[PuppetGroupStorage(name='ctbrownroot', owner='ctbrown', group=None, autofs=PuppetAutofs(nas='nas-6-0-ib', path='/nas-6-0/ctbrowngrp', options=None), zfs=PuppetZFS(quota='220T')),
 PuppetGroupStorage(name='ctbrowngrp', owner='ctbrown', group=None, autofs=PuppetAutofs(nas='nas-6-0-ib', path='/nas-6-0/ctbrowngrp/group', options=None), zfs=None),
 PuppetGroupStorage(name='ctbrowngrp2', owner='ctbrown', group=None, autofs=PuppetAutofs(nas='nas-4-0-ib', path='/nas-4-0/ctbrowngrp2', options=None), zfs=PuppetZFS(quota='100T')),
 PuppetGroupStorage(name='ctbrowngrp3', owner='ctbrown', group=None, autofs=PuppetAutofs(nas='nas-4-0-ib', path='/nas-4-0/ctbrowngrp3', options=None), zfs=PuppetZFS(quota='50T'))]