# PRE-CONDITION: Have a programmatic user account in AWS, fill in dwh.cfg file with its Key and Secret:
[AWS_MANAGER]
KEY=HERE
SECRET=HERE

# 1. Load AWS DWH Params from dwh.cfg file

In [1]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                        = config.get('AWS_MANAGER','KEY')
SECRET                     = config.get('AWS_MANAGER','SECRET')

EMR_CLUSTER_NAME           = config.get("CLUSTER","NAME")
EMR_EC2_ATTRIBUTES_KEYNAME = config.get("CLUSTER","EC2_ATTRIBUTES_KEYNAME")
EMR_INSTANCE_TYPE          = config.get("CLUSTER","INSTANCE_TYPE")
EMR_INSTANCE_COUNT         = config.get("CLUSTER","INSTANCE_COUNT")

EMR_RELEASE_LABEL          = config.get("CLUSTER","RELEASE_LABEL")
EMR_BOOTSTRAP_ACTIONS_PATH = config.get("CLUSTER","BOOTSTRAP_ACTIONS_PATH")
EMR_APPLICATIONS_NAME      = config.get("CLUSTER","APPLICATIONS_NAME")

AWS_REGION                 = config.get("AWS_CONF", "REGION")


# 2. Create EMR client

In [2]:
import boto3

emr_client = boto3.client('emr',region_name=AWS_REGION,aws_access_key_id=KEY,aws_secret_access_key=SECRET)

# 3. Create EMR instance

In [11]:
cluster_id = emr_client.run_job_flow(Name=EMR_CLUSTER_NAME,ReleaseLabel=EMR_RELEASE_LABEL,
    Applications=[
        {
            'Name': EMR_APPLICATIONS_NAME
        },
    ],
    Instances=[{
        'InstanceGroups': [
            {
                'Name': EMR_CLUSTER_NAME,
                'Market': 'ON_DEMAND',
                'InstanceType': EMR_INSTANCE_TYPE,
                'InstanceCount': 2,
                'InstanceRole': 'MASTER'
            }
        ],
        'Ec2KeyName': KEY,
        'KeepJobFlowAliveWhenNoSteps': False,
    }],
    
    BootstrapActions=[
        {
            'Name': 'Install elementary tools',
            'ScriptBootstrapAction': {
                'Path': 's3://data-eng-nanodegree/emr_spark_bootstrap.sh',
            }
        },
    ],
)

ParamValidationError: Parameter validation failed:
Invalid type for parameter Instances, value: [{'InstanceGroups': [{'Name': 'EMR_SPARK', 'Market': 'ON_DEMAND', 'InstanceType': 'm5.xlarge', 'InstanceCount': 2, 'InstanceRole': 'MASTER'}], 'Ec2KeyName': 'AKIAY3U5YEETGKQGIFPK', 'KeepJobFlowAliveWhenNoSteps': False}], type: <class 'list'>, valid types: <class 'dict'>

In [None]:
try:
    iam_client.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                           PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                           )['ResponseMetadata']['HTTPStatusCode']
except Exception as e:
    print(e)

In [None]:
roleArn = iam_client.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

# 4. Create Redshift (RS) Cluster

In [None]:
try:
    response = rs_client.create_cluster(        
        # parameters for hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        # identifiers & credentials
        DBName=DB_NAME,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,
        
        # role (to allow s3 access)
        IamRoles=[roleArn] 
    )
except Exception as e:
    print(e)

### 4.1. Check cluster status executing below line as many times needed. Move to the next commands only when the cluster status becomes available

In [None]:
import pandas as pd

def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = rs_client.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

## 5. Take note of the Cluster endpoint and ARN 
## ===> Copy below information to *dwh.cfg* file

[CLUSTER]
HOST=HERE
[IAM]
ARN=HERE


In [None]:
HOST = myClusterProps['Endpoint']['Address']
ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("HOST :: ", HOST)
print("ARN :: ", ARN)

# 6. AWS RESOURCES COSTS MONEY, CLEAN IT UP AT THE END OF YOUR PROJECT EXECUTION!!!!!!

### Delete Redshift created cluster

In [None]:
rs_client.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

#### Don't leave this notebook while the ClusterStatus is deleting, wait for an error message.

In [None]:
import pandas as pd

def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = rs_client.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

### Detach and delete created IAM role

In [None]:
iam_client.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam_client.delete_role(RoleName=DWH_IAM_ROLE_NAME)