Skip to content

Commit

Permalink
Fix bug where single node cannot be used (Azure#2077)
Browse files Browse the repository at this point in the history
* add hostfile check with only one node

* change logic

* add changes to training example
  • Loading branch information
cassieesvelt committed Feb 16, 2023
1 parent 15053a4 commit 008dc62
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 15 deletions.
24 changes: 17 additions & 7 deletions cli/jobs/deepspeed/deepspeed-autotuning/src/start-deepspeed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
az_batch_host_list="$AZ_BATCH_HOST_LIST"
RANK="$AZUREML_CR_NODE_RANK"

# Start ssh
# Get ssh key from generated-key and add it to the current node.
mkdir -p /root/.ssh
mkdir /var/run/sshd
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
Expand All @@ -18,11 +18,18 @@ touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
/usr/sbin/sshd -D -p 1143 &

# Create hostfile. Use num_gpus_per_node to populate slots value.
oldIFS=IFS
IFS=',' read -ra host_list <<< "$az_batch_host_list"
IFS=$oldIFS
## Create hostfile. Use num_gpus_per_node to populate slots value.
# Populate host_list with the node hostnames. AZ_BATCH_HOST_LIST is a
# comma-separated list; if it is empty or unset, only one node is in use and
# the hostfile should contain just localhost.
if [[ -z "$AZ_BATCH_HOST_LIST" ]]
then
    # Keep host_list an array in both branches so later iteration is uniform.
    host_list=("localhost")
else
    # The IFS=',' prefix applies only to this read command, so the shell-wide
    # IFS is left untouched and needs no save/restore. (Saving it with
    # `oldIFS=IFS` stored the literal string "IFS" and then corrupted word
    # splitting for the rest of the script.)
    IFS=',' read -ra host_list <<< "$az_batch_host_list"
fi

# Create and write hosts to hostfile.
sudo mkdir /job
if [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
Expand All @@ -33,20 +40,23 @@ then
done
fi

# Show hostfile
echo Hostfile generated
echo ------------
cat /job/hostfile
echo ------------

# Create deepspeed call
# Create deepspeed call using arguments passed in.
# The command string starts with the launcher and the hostfile at /job/hostfile.
ds_call="deepspeed --hostfile /job/hostfile "
shift
shift # Shift over to remove the first argument (already used in hostfile above)
# Append each remaining positional argument, followed by a single space.
for i in "$@"
do
ds_call+=$i
ds_call+=" "
done
ls

# Evaluate deepspeed command only in first process.
if [[ $RANK == 0 ]] && [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
echo rank is 0, starting deepspeed
Expand Down
26 changes: 18 additions & 8 deletions cli/jobs/deepspeed/deepspeed-training/src/start-deepspeed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
az_batch_host_list="$AZ_BATCH_HOST_LIST"
RANK="$AZUREML_CR_NODE_RANK"

# Start ssh
# Get ssh key from generated-key and add it to the current node.
mkdir -p /root/.ssh
mkdir /var/run/sshd
sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
Expand All @@ -18,11 +18,18 @@ touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
/usr/sbin/sshd -D -p 1143 &

# Create hostfile. Use num_gpus_per_node to populate the slots variable.
oldIFS=IFS
IFS=',' read -ra host_list <<< "$az_batch_host_list"
IFS=$oldIFS
## Create hostfile. Use num_gpus_per_node to populate slots value.
# Populate host_list with the node hostnames. AZ_BATCH_HOST_LIST is a
# comma-separated list; if it is empty or unset, only one node is in use and
# the hostfile should contain just localhost.
if [[ -z "$AZ_BATCH_HOST_LIST" ]]
then
    # Keep host_list an array in both branches so later iteration is uniform.
    host_list=("localhost")
else
    # The IFS=',' prefix applies only to this read command, so the shell-wide
    # IFS is left untouched and needs no save/restore. (Saving it with
    # `oldIFS=IFS` stored the literal string "IFS" and then corrupted word
    # splitting for the rest of the script.)
    IFS=',' read -ra host_list <<< "$az_batch_host_list"
fi

# Create and write hosts to hostfile.
sudo mkdir /job
if [[ $AZUREML_PROCESS_NAME == "rank_0" ]]
then
Expand All @@ -33,24 +40,27 @@ then
done
fi

# Show hostfile
echo Hostfile generated
echo ------------
cat /job/hostfile
echo ------------

# Create deepspeed call
# Create deepspeed call using arguments passed in.
# The command string starts with the launcher and the hostfile at /job/hostfile.
ds_call="deepspeed --hostfile /job/hostfile "
shift
shift # Shift over to remove the first argument (already used in hostfile above)
# Append each remaining positional argument, followed by a single space.
for i in "$@"
do
ds_call+=$i
ds_call+=" "
done
ls

# Evaluate deepspeed command only in first process.
# Launch deepspeed from exactly one process: node rank 0 and the process
# named "rank_0". Every other process falls through without launching.
if [[ "$RANK" == 0 && "$AZUREML_PROCESS_NAME" == "rank_0" ]]
then
    echo rank is 0, starting deepspeed
    # NOTE(review): presumably this pause lets sshd on the other nodes come up
    # before deepspeed connects to them -- confirm.
    sleep 60
    # Quote the expansion so the command string is printed and evaluated
    # exactly as built, with no word splitting or glob expansion applied
    # before eval re-parses it.
    echo "$ds_call"
    eval "$ds_call"
fi
fi

0 comments on commit 008dc62

Please sign in to comment.