# Spool up EMR Cluster to run HAIL

In [1]:
import json
import os
import re
import shlex
import sys
import time

import boto3
import botocore
import paramiko
import yaml

In [2]:
# Absolute path of the notebook's working directory; the config file, the log
# file and the installation scripts are all resolved relative to it.
PATH = os.path.abspath(os.curdir)
PATH

'/data/hail-on-AWS-spot-instances/src'

In [3]:
#Setup logging
import logging, bluebee
from bluebee import bgp

# The root logger is used so that messages from third-party libraries
# (paramiko, botocore, ...) end up in the same console/file output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack a second pair of handlers on the root
# logger (that would duplicate every log line), so drop the ones this cell
# installed on a previous execution. Handlers added by anyone else are kept.
for stale_handler in [h for h in logger.handlers
                      if h.get_name() in ('spoolup_file', 'spoolup_console')]:
    logger.removeHandler(stale_handler)
    stale_handler.close()
# create file handler that records INFO and above next to the notebook
logFile = f'{PATH}/spoolup_EMR_cluster.log'
fh = logging.FileHandler(logFile)
fh.set_name('spoolup_file')
fh.setLevel(logging.INFO)
# create console handler, also at INFO level
ch = logging.StreamHandler()
ch.set_name('spoolup_console')
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)-18s-%(levelname)-8s %(message)s', datefmt='%d%b%Y %H:%M:%S')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
# add the handlers to logger
logger.addHandler(ch)
logger.addHandler(fh)
# Optional debugging switches for the bluebee client:
# bgp.api.dump_curl = True
# bluebee.logger.setLevel(logging.DEBUG)

2021021016


In [4]:
#Get the configuration as a yaml object
# The file is opened in a `with` block so the handle is closed as soon as the
# settings are parsed. safe_load only builds plain Python objects, exactly
# like yaml.load(..., Loader=yaml.SafeLoader).
config_path = os.path.join(PATH, "config_EMR_spot.yaml")
with open(config_path) as config_file:
    c = yaml.safe_load(config_file)
logger.info(f'Configuration settings: {c}')

26Mar2021 22:35:43-INFO     Configuration settings: {'config': {'EMR_RELEASE': 'emr-6.2.0', 'EMR_CLUSTER_NAME': 'tdeboer-hail', 'EC2_NAME_TAG': 'tdeboer-hail-EMR', 'OWNER_TAG': 'tdeboer-ilmn', 'PROJECT_TAG': 'GRE_on_ICA', 'REGION': 'us-east-1', 'MASTER_INSTANCE_TYPE': 'm4.large', 'WORKER_INSTANCE_TYPE': 'r4.4xlarge', 'WORKER_COUNT': '1', 'WORKER_BID_PRICE': '0.50', 'MASTER_HD_SIZE': '250', 'WORKER_HD_SIZE': '500', 'SUBNET_ID': '', 'S3_BUCKET': 's3n://ilmn-hail/', 'KEY_NAME': 'hail-ES-GRE', 'PATH_TO_KEY': '/data/', 'WORKER_SECURITY_GROUP': 'sg-0df1e5704ca2a8196', 'MASTER_SECURITY_GROUP': 'sg-0bab1202c0aa453b3', 'HAIL_VERSION': 'current'}}


In [5]:
#Store or update the bootstrapping file into the S3 bucket (same as used for the logs, for now)
#TODO

In [6]:
# Spot instances and different CORE/MASTER instances
# Build the `aws emr create-cluster` command line. The JSON arguments are
# serialised with json.dumps and every argument is shell-quoted, so the command
# stays valid when a config value is an unquoted YAML number or contains
# characters that are special to the shell.
cfg = c['config']


def _ebs_configuration(size_in_gb):
    """Return the EbsConfiguration for one gp2 volume of `size_in_gb` GB per instance."""
    return {
        "EbsBlockDeviceConfigs": [
            {
                "VolumeSpecification": {"SizeInGB": int(size_in_gb), "VolumeType": "gp2"},
                "VolumesPerInstance": 1,
            }
        ]
    }


ec2_attributes = {
    "KeyName": cfg['KEY_NAME'],
    "InstanceProfile": "EMR_EC2_DefaultRole",
    # An empty SUBNET_ID is passed through as an empty string, as before
    "SubnetId": str(cfg['SUBNET_ID'] or ''),
    "EmrManagedSlaveSecurityGroup": cfg['WORKER_SECURITY_GROUP'],
    "EmrManagedMasterSecurityGroup": cfg['MASTER_SECURITY_GROUP'],
}

instance_groups = [
    {
        "InstanceCount": 1,
        "EbsConfiguration": _ebs_configuration(cfg['MASTER_HD_SIZE']),
        "InstanceGroupType": "MASTER",
        "InstanceType": cfg['MASTER_INSTANCE_TYPE'],
        "Name": "Master-Instance",
    },
    {
        "InstanceCount": int(cfg['WORKER_COUNT']),
        # BidPrice is what makes the CORE group run on spot instances
        "BidPrice": str(cfg['WORKER_BID_PRICE']),
        "EbsConfiguration": _ebs_configuration(cfg['WORKER_HD_SIZE']),
        "InstanceGroupType": "CORE",
        "InstanceType": cfg['WORKER_INSTANCE_TYPE'],
        "Name": "Core-Group",
    },
]

configurations = [
    {"Classification": "spark", "Properties": {"maximizeResourceAllocation": "true"}},
    {
        "Classification": "yarn-site",
        "Properties": {"yarn.nodemanager.vmem-check-enabled": "false"},
        "Configurations": [],
    },
]

compact = {"separators": (',', ':')}
cli_arguments = [
    'aws', 'emr', 'create-cluster',
    '--applications', 'Name=Hadoop', 'Name=Spark',
    '--tags',
    f"project={cfg['PROJECT_TAG']}",
    f"Owner={cfg['OWNER_TAG']}",
    f"Name={cfg['EC2_NAME_TAG']}",
    '--ec2-attributes', json.dumps(ec2_attributes, **compact),
    '--service-role', 'EMR_DefaultRole',
    '--release-label', cfg['EMR_RELEASE'],
    '--log-uri', cfg['S3_BUCKET'],
    '--name', cfg['EMR_CLUSTER_NAME'],
    '--instance-groups', json.dumps(instance_groups, **compact),
    '--configurations', json.dumps(configurations, **compact),
    '--auto-scaling-role', 'EMR_AutoScaling_DefaultRole',
    '--ebs-root-volume-size', '32',
    '--scale-down-behavior', 'TERMINATE_AT_TASK_COMPLETION',
    '--region', cfg['REGION'],
]
# `command` stays a single shell string because it is executed through the shell later on
command = ' '.join(shlex.quote(str(argument)) for argument in cli_arguments)
logger.info(f'Executing following command: \n{command}')

26Mar2021 22:35:43-INFO     Executing following command: 
aws emr create-cluster --applications Name=Hadoop Name=Spark --tags 'project=GRE_on_ICA' 'Owner=tdeboer-ilmn' 'Name=tdeboer-hail-EMR' --ec2-attributes '{"KeyName":"hail-ES-GRE","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"","EmrManagedSlaveSecurityGroup":"sg-0df1e5704ca2a8196","EmrManagedMasterSecurityGroup":"sg-0bab1202c0aa453b3"}' --service-role EMR_DefaultRole --release-label 'emr-6.2.0' --log-uri 's3n://ilmn-hail/' --name 'tdeboer-hail' --instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":250,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"m4.large","Name":"Master-Instance"},{"InstanceCount":1,"BidPrice":"0.50","EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":500,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"r4.4xlarge","Name":"Core-Group

In [7]:
# Launch the cluster through the AWS CLI and capture everything it prints.
# The pipe is closed explicitly by the `with` block.
with os.popen(command) as cli_pipe:
    cluster_id_json = cli_pipe.read()
# The CLI output format depends on the active profile (text, json, ...), so the
# cluster ID (j-XXXXXXXXXXXXX) is located by pattern instead of by position.
cluster_id_match = re.search(r'\bj-[A-Z0-9]+\b', cluster_id_json)
if cluster_id_match is None:
    # An empty or unexpected output means the CLI call failed; stop here with a clear message
    raise RuntimeError(f'Could not find an EMR cluster ID in the AWS CLI output: {cluster_id_json!r}')
cluster_id = cluster_id_match.group(0)
# Gives EMR cluster information
client_EMR = boto3.client('emr', region_name=c['config']['REGION'])
logger.warning(f'Created cluster with ID: "{cluster_id}"')

26Mar2021 22:35:45-INFO     Found credentials in shared credentials file: ~/.aws/credentials


In [8]:
# Cluster state update
# Poll the cluster every 30 seconds until it is ready (WAITING) or has reached
# a state from which it can never become ready.
status_EMR='STARTING'
tic = time.time()
# Wait until the cluster is created
logger.info('Creating EMR...')

FAILED_STATES = ('TERMINATING', 'TERMINATED', 'TERMINATED_WITH_ERRORS')

while True:
    details_EMR=client_EMR.describe_cluster(ClusterId=cluster_id)
    status_EMR=details_EMR.get('Cluster').get('Status').get('State')
    logger.info('Cluster status: '+status_EMR)
    if (status_EMR=='WAITING'):
        logger.warning('Cluster successfully created! Starting HAIL installation...')
        toc=time.time()-tic
        logger.warning("Total time to provision your cluster: %.2f "%(toc/60)+" minutes")
        break
    if status_EMR in FAILED_STATES:
        # Include the reason reported by EMR, when there is one, to ease troubleshooting
        reason = details_EMR.get('Cluster').get('Status').get('StateChangeReason', {}).get('Message')
        if reason:
            logger.error(f'State change reason: {reason}')
        err = "Cluster un-successfully created. Ending installation..."
        logger.error(err)
        sys.exit(err)
    # Only sleep when another poll is actually needed
    time.sleep(30)

26Mar2021 22:35:45-INFO     Creating EMR...
26Mar2021 22:35:45-INFO     Cluster status: STARTING
26Mar2021 22:36:15-INFO     Cluster status: STARTING
26Mar2021 22:36:45-INFO     Cluster status: STARTING
26Mar2021 22:37:15-INFO     Cluster status: STARTING
26Mar2021 22:37:45-INFO     Cluster status: STARTING
26Mar2021 22:38:15-INFO     Cluster status: STARTING
26Mar2021 22:38:46-INFO     Cluster status: STARTING
26Mar2021 22:39:16-INFO     Cluster status: STARTING
26Mar2021 22:39:46-INFO     Cluster status: STARTING
26Mar2021 22:40:16-INFO     Cluster status: STARTING
26Mar2021 22:40:46-INFO     Cluster status: STARTING
26Mar2021 22:41:16-INFO     Cluster status: STARTING
26Mar2021 22:41:46-INFO     Cluster status: STARTING
26Mar2021 22:42:16-INFO     Cluster status: STARTING
26Mar2021 22:42:47-INFO     Cluster status: STARTING
26Mar2021 22:43:17-INFO     Cluster status: STARTING
26Mar2021 22:43:47-INFO     Cluster status: STARTING
26Mar2021 22:44:17-INFO     Cluster status: WAITING


In [9]:
# Get public DNS from master node
master_dns=details_EMR.get('Cluster').get('MasterPublicDnsName')
# The IP address is encoded in the first DNS label: "ec2-54-159-68-151...." for
# a public name, "ip-10-0-0-12...." when the cluster sits in a private subnet.
dns_match = re.match(r'(?:ec2|ip)-(\d+)-(\d+)-(\d+)-(\d+)\.', master_dns or '')
if dns_match is None:
    raise ValueError(f'Cannot derive the master IP address from DNS name: {master_dns!r}')
master_IP = '.'.join(dns_match.groups())
logger.info(f'Master IP address: {master_IP}')

26Mar2021 22:44:47-INFO     Master IP address: 54.159.68.151


In [10]:
#Get ssh connection to master instance
logger.info('Creating secure SSH connection to instance...')
# os.path.join makes the key path work whether or not PATH_TO_KEY ends with a slash
key_path = os.path.join(c['config']['PATH_TO_KEY'], c['config']['KEY_NAME']+'.pem')
key = paramiko.RSAKey.from_private_key_file(key_path)
client = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy trusts whatever host key the server presents.
# That is convenient for a freshly created instance, but it gives no
# protection against a man-in-the-middle.
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname=master_IP, username="hadoop", pkey=key)
#Run a quick test on the connection
stdin, stdout, stderr = client.exec_command('echo "Hello"')
assert stdout.readline() == 'Hello\n'

26Mar2021 22:44:47-INFO     Creating secure SSH connection to instance...
26Mar2021 22:44:47-INFO     Connected (version 2.0, client OpenSSH_7.4)
26Mar2021 22:44:47-INFO     Authentication (publickey) successful!


In [11]:
# Copy the key into the master
# Only the ~/.ssh directory must exist: creating "~/.ssh/id_rsa" with mkdir
# would turn the key's destination into a directory. exec_command returns
# immediately, so wait for mkdir to finish before scp starts.
stdin, stdout, stderr = client.exec_command('mkdir -p ~/.ssh')
assert stdout.channel.recv_exit_status() == 0
# os.path.join makes the key path work whether or not PATH_TO_KEY ends with a slash
key_path = os.path.join(c['config']['PATH_TO_KEY'], c['config']['KEY_NAME']+'.pem')
command = (
    f"scp -o 'StrictHostKeyChecking no' -i {key_path} "
    f"{key_path} hadoop@{master_dns}:/home/hadoop/.ssh/id_rsa"
)
logger.info(f'Copying the keys to the instance with command: \n{command}')
assert os.system(command) == 0

26Mar2021 22:44:47-INFO     Copying the keys to the instance with command: 
scp -o 'StrictHostKeyChecking no' -i /data/hail-ES-GRE.pem /data/hail-ES-GRE.pem hadoop@ec2-54-159-68-151.compute-1.amazonaws.com:/home/hadoop/.ssh/id_rsa


In [12]:
# Copy the hail installation script into the master
# os.path.join makes the paths work whether or not PATH_TO_KEY ends with a slash
key_path = os.path.join(c['config']['PATH_TO_KEY'], c['config']['KEY_NAME']+'.pem')
script_path = os.path.join(PATH, 'install_hail_and_python.sh')
command = (
    f"scp -o 'StrictHostKeyChecking no' -i {key_path} "
    f"{script_path} hadoop@{master_dns}:/home/hadoop"
)
logger.info(f'Copying the hail installation script to the instance with command: \n{command}')
assert os.system(command) == 0

26Mar2021 22:44:48-INFO     Copying the hail installation script to the instance with command: 
scp -o 'StrictHostKeyChecking no' -i /data/hail-ES-GRE.pem /data/hail-on-AWS-spot-instances/src/install_hail_and_python.sh hadoop@ec2-54-159-68-151.compute-1.amazonaws.com:/home/hadoop


In [13]:
# Copy the installation script into the master
# os.path.join makes the paths work whether or not PATH_TO_KEY ends with a slash
key_path = os.path.join(c['config']['PATH_TO_KEY'], c['config']['KEY_NAME']+'.pem')
script_path = os.path.join(PATH, 'install_python.sh')
command = (
    f"scp -o 'StrictHostKeyChecking no' -i {key_path} "
    f"{script_path} hadoop@{master_dns}:/home/hadoop"
)
logger.info(f'Copying the python installation script to the instance with command: \n{command}')
assert os.system(command) == 0

26Mar2021 22:44:48-INFO     Copying the python installation script to the instance with command: 
scp -o 'StrictHostKeyChecking no' -i /data/hail-ES-GRE.pem /data/hail-on-AWS-spot-instances/src/install_python.sh hadoop@ec2-54-159-68-151.compute-1.amazonaws.com:/home/hadoop


In [14]:
#Install the software on master
VERSION=c['config']['HAIL_VERSION']
logger.info(f'Installing hail version {VERSION} et. al on Master')
logger.warning(f'This is the public JupyterLab link: "http://{master_IP}:8192"')
# Formatting (rather than concatenating) keeps this working when HAIL_VERSION
# is written in the YAML as an unquoted number.
command=f'./install_hail_and_python.sh -v {VERSION}'
logger.info(f'Executing remote command: "{command}"')
# Every exec_command call runs in its own channel, so a separate "cd" would
# have no effect on the next call: change directory and run the installer in
# one remote command.
stdin, stdout, stderr = client.exec_command('cd /home/hadoop/ && ' + command)
# Stream the installer's standard output into the log as it arrives
for line in stdout:
    logger.info(line.rstrip())
# stdout is exhausted, so the remote command has finished: report a failing exit status
exit_status = stdout.channel.recv_exit_status()
if exit_status != 0:
    logger.error(f'Remote command "{command}" exited with status {exit_status}')

26Mar2021 22:44:48-INFO     Installing hail version current et. al on Master
26Mar2021 22:44:48-INFO     Executing remote command: "./install_hail_and_python.sh -v current"
26Mar2021 22:44:49-INFO     Keys successfully copied to the worker nodes
26Mar2021 22:44:50-INFO     Loaded plugins: extras_suggestions, langpacks, priorities, update-motd
26Mar2021 22:44:51-INFO     3 packages excluded due to repository priority protections
26Mar2021 22:44:52-INFO     No package python37-devel available.
26Mar2021 22:44:52-INFO     Resolving Dependencies
26Mar2021 22:44:52-INFO     --> Running transaction check
26Mar2021 22:44:52-INFO     ---> Package python3.x86_64 0:3.7.9-1.amzn2.0.1 will be updated
26Mar2021 22:44:52-INFO     --> Processing Dependency: python3 = 3.7.9-1.amzn2.0.1 for package: python3-test-3.7.9-1.amzn2.0.1.x86_64
26Mar2021 22:44:52-INFO     --> Processing Dependency: python3 = 3.7.9-1.amzn2.0.1 for package: python3-tools-3.7.9-1.amzn2.0.1.x86_64
26Mar2021 22:44:52-INFO     --> P

In [15]:
# Replay everything the remote installer wrote to its standard error stream,
# one log record per line (trailing whitespace stripped).
for stderr_line in stderr:
    message = stderr_line.rstrip()
    logger.error(message)

26Mar2021 22:49:47-ERROR    Existing lock /var/run/yum.pid: another copy is running as pid 14007.
26Mar2021 22:49:47-ERROR    Another app is currently holding the yum lock; waiting for it to exit...
26Mar2021 22:49:47-ERROR      The other application is: yum
26Mar2021 22:49:47-ERROR        Memory : 141 M RSS (358 MB VSZ)
26Mar2021 22:49:47-ERROR        Started: Fri Mar 26 22:44:58 2021 - 00:10 ago
26Mar2021 22:49:47-ERROR        State  : Running, pid: 14007
26Mar2021 22:49:47-ERROR    Existing lock /var/run/yum.pid: another copy is running as pid 14352.
26Mar2021 22:49:47-ERROR    Another app is currently holding the yum lock; waiting for it to exit...
26Mar2021 22:49:47-ERROR      The other application is: yum
26Mar2021 22:49:47-ERROR        Memory :  49 M RSS (265 MB VSZ)
26Mar2021 22:49:47-ERROR        Started: Fri Mar 26 22:45:24 2021 - 00:03 ago
26Mar2021 22:49:47-ERROR        State  : Running, pid: 14352
26Mar2021 22:49:47-ERROR    Another app is currently holding the yum lock; w

In [16]:
# Announce the shutdown in the log, then release the SSH session to the master node
closing_message = 'Closing the client connection'
logger.info(closing_message)
client.close()

26Mar2021 22:49:47-INFO     Closing the client connection


In [17]:
# Final notice; logged at WARNING level like the other milestone messages in this notebook
logger.warning(f'Successfully started EMR cluster "{cluster_id}"')

