Slurm configuration
Edit the Slurm configuration file

# vi /etc/slurm/slurm.conf

A starting point for this file can be generated with the online configurator at https://slurm.schedmd.com/configurator.html.
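slurm.conf must be identical on the controller and on every compute node. A quick way to push it out after each edit, assuming pdsh/pdcp is installed and the compute nodes are named c1-c5 as in the NODES section below:

# pdcp ships with pdsh; copy the edited file to all compute nodes
pdcp -w c[1-5] /etc/slurm/slurm.conf /etc/slurm/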

#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=galaxy.cluster
SlurmctldHost=master
#ControlMachine=master
#ControlAddr=192.168.1.254
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/none
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SelectType=select/linear
FastSchedule=1
#SchedulerPort=7321
SelectType=select/cons_res
SelectTypeParameters=CR_CPU
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/none
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=master
AccountingStoragePort=6819
AccountingStoreJobComment=YES
AccountingStorageLoc=slurm_acct_db
AccountingStoragePass=/var/run/munge/munge.socket.2
AccountingStorageUser=slurm
#
JobCompHost=master
JobCompLoc=slurm_acct_db
JobCompPass=slurm@1234
#JobCompPort=
#JobCompType=jobcomp/slurmdbd
JobCompUser=slurm
#JobContainerPlugin=job_container/none
#
# COMPUTE NODES
# OpenHPC default configuration
PropagateResourceLimitsExcept=MEMLOCK
#Epilog=/etc/slurm/slurm.epilog.clean
#
GresTypes=gpu

#NODES
NodeName=master NodeAddr=master NodeHostName=master Gres=gpu:2 CPUs=8 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=64000 State=UNKNOWN
NodeName=c1 NodeAddr=c1 NodeHostName=c1 Gres=gpu:2 CPUs=12 CoresPerSocket=12 ThreadsPerCore=1 RealMemory=64000 State=UNKNOWN
NodeName=c2 NodeAddr=c2 NodeHostName=c2 CPUs=16 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=16000 State=UNKNOWN
NodeName=c3 NodeAddr=c3 NodeHostName=c3 Gres=gpu:2 CPUs=12 CoresPerSocket=12 ThreadsPerCore=1 RealMemory=64000 State=UNKNOWN
NodeName=c4 NodeAddr=c4 NodeHostName=c4 Gres=gpu:1 CPUs=16 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=32000 State=UNKNOWN
NodeName=c5 NodeAddr=c5 NodeHostName=c5 CPUs=16 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=16000 State=UNKNOWN

#PARTITIONS
PartitionName=qcpu Nodes=c[1-5],master Default=YES State=UP MaxTime=120:00:00 DefaultTime=0:30:0 DefMemPerCPU=512 Shared=NO
PartitionName=qgpu_gtx1070ti Nodes=master,c1 State=UP MaxTime=72:00:00 DefaultTime=0:30:0 DefMemPerCPU=512 Shared=NO
PartitionName=qgpu_rtx2070 Nodes=c3 State=UP MaxTime=72:00:00 DefaultTime=0:30:0 DefMemPerCPU=512 Shared=NO
PartitionName=qgpu_rtx2080 Nodes=c4 State=UP MaxTime=72:00:00 DefaultTime=0:30:0 DefMemPerCPU=512 Shared=NO

ReturnToService=0
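The CPUs, CoresPerSocket, ThreadsPerCore and RealMemory values in the NodeName lines should match the actual hardware. slurmd can print the correct values for the machine it runs on, in slurm.conf syntax:

# Run on each node and paste the output into the NODES section above
slurmd -C
# Sample (abridged) output; values differ per machine:
# NodeName=c1 CPUs=12 Boards=1 SocketsPerBoard=1 CoresPerSocket=12 ThreadsPerCore=1 RealMemory=64213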

Create the file gres.conf

# vi /etc/slurm/gres.conf
NodeName=master Name=gpu File=/dev/nvidia[0-1]
NodeName=c1 Name=gpu File=/dev/nvidia[0-1]
NodeName=c3 Name=gpu File=/dev/nvidia[0-1]
NodeName=c4 Name=gpu File=/dev/nvidia0
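Each device file listed in gres.conf must exist on the corresponding node. A quick sanity check across the cluster, assuming pdsh and the NVIDIA driver are installed:

# List the NVIDIA device files on the head node and all compute nodes
pdsh -w master,c[1-5] 'ls /dev/nvidia[0-9]* 2>/dev/null'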
Restart the daemons

systemctl restart slurmctld
systemctl restart slurmd
systemctl restart munge
pdsh -w c[1-5] systemctl restart slurmd
pdsh -w c[1-5] systemctl restart munge
scontrol show nodes
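Once the daemons are back up, the GPUs should show up as generic resources:

# Show the GRES of a single node and of every node per partition
scontrol show node c1 | grep -i gres
sinfo -N -o '%N %P %G'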

Change the state of a node

# Return compute nodes to service after a reboot
scontrol update NodeName=c[1-5] State=RESUME
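The same command takes a node out of service for maintenance; a Reason string is required when draining:

scontrol update NodeName=c3 State=DRAIN Reason="disk replacement"
scontrol update NodeName=c3 State=RESUME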

Create slurmdbd.conf

#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSteps=no
ArchiveSuspend=no
ArchiveTXN=no
ArchiveUsage=no
#ArchiveDir="/tmp"
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
DbdAddr=master
DbdHost=master
DbdPort=6819
SlurmUser=slurm
#MessageTimeout=300
DebugLevel=4
#DefaultQOS=normal,standby
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost=localhost
StoragePort=3306
StoragePass=slurm@1234
StorageUser=slurm
StorageLoc=slurm_acct_db

PurgeEventAfter=12month
PurgeJobAfter=12month
PurgeResvAfter=2month
PurgeStepAfter=2month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=12month
MaxQueryTimeRange=60-0
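slurmdbd.conf contains the database password, so it must be owned by SlurmUser and not be readable by other users (recent slurmdbd releases refuse to start otherwise):

chown slurm:slurm /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf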

Set up MySQL

# mysql -p
mysql> create database slurm_acct_db;
mysql> grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by 'slurm@1234' with grant option;
mysql> quit;
systemctl start slurmdbd
systemctl restart slurmctld
sacctmgr add cluster galaxy.cluster
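To check that accounting works end to end, list the registered cluster and create a first association; 'research' and 'alice' below are placeholder names:

sacctmgr show cluster
# Placeholder account and user; replace with real ones
sacctmgr add account research Description="example account"
sacctmgr add user alice Account=research
# Job records should appear here once jobs have run
sacct -a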

Firewalld for Slurm daemons

Slurm uses SlurmctldPort=6817, SlurmdPort=6818, and SchedulerPort=7321; slurmdbd listens on port 6819.

yum install firewalld firewall-config
systemctl start firewalld
systemctl enable firewalld
firewall-cmd --permanent --zone=public --add-port=6817/tcp
firewall-cmd --permanent --direct --add-rule ipv4 filter INPUT_direct 0 -s 192.168.0.0/16 -j ACCEPT
firewall-cmd --permanent --zone=public --add-port=6819/tcp
firewall-cmd --reload
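The commands above only open the controller's ports. If firewalld is also active on the compute nodes, slurmd's port 6818 has to be reachable from the controller as well:

pdsh -w c[1-5] "firewall-cmd --permanent --zone=public --add-port=6818/tcp"
pdsh -w c[1-5] firewall-cmd --reload
# Verify on the master
firewall-cmd --list-ports --zone=public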

MySQL configuration

Create a new file /etc/my.cnf.d/innodb.cnf containing:

[mysqld]
innodb_buffer_pool_size=1024M
innodb_log_file_size=64M
innodb_lock_wait_timeout=900

To apply this change, shut down the database and move or remove the old InnoDB log files (changing innodb_log_file_size requires recreating them):

systemctl stop mariadb
mv /var/lib/mysql/ib_logfile? /tmp/
systemctl start mariadb
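After the restart, confirm that MariaDB picked up the new setting:

mysql -p -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';"
# Expected value: 1073741824 (= 1024M)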