In [1]:
import configparser

config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

# amazon aws
KEY = config.get('AWS', 'key')
SECRET = config.get('AWS', 'secret')

# Redshift
DWH_DB = config.get('DWH', 'DWH_DB')
DWH_DB_USER = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT = config.get('DWH', 'DWH_PORT')
DWH_CLUSTER_TYPE = config.get('DWH', 'DWH_CLUSTER_TYPE')
DWH_NUM_NODES = config.get('DWH', 'DWH_NUM_NODES')
DWH_NODE_TYPE = config.get('DWH', 'DWH_NODE_TYPE')
DWH_IAM_ROLE_NAME = config.get('DWH', 'DWH_IAM_ROLE_NAME')
DWH_CLUSTER_IDENTIFIER = config.get('DWH', 'DWH_CLUSTER_IDENTIFIER')
DWH_SCHEMA = config.get('DWH', 'DWH_SCHEMA')
DWH_LOG_STAGING_TABLE = config.get('DWH', 'DWH_LOG_STAGING_TABLE')
DWH_SONG_STAGING_TABLE = config.get('DWH', 'DWH_SONG_STAGING_TABLE')

# s3
S3_BUCKET_LOG_JSON_PATH = config.get('S3', 'S3_BUCKET_LOG_JSON_PATH')
S3_BUCKET_SONG_JSON_PATH = config.get('S3', 'S3_BUCKET_SONG_JSON_PATH')

In [2]:
import boto3
from config import *
import json
from botocore.exceptions import ClientError
import time

iam = boto3.client('iam', aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET,
                       region_name='us-west-2'
                       )
try:
    iam.create_role(Path='/',
                    RoleName=DWH_IAM_ROLE_NAME,
                    Description="Allows Redshift clusters to call AWS services on your behalf.",
                    AssumeRolePolicyDocument=json.dumps(
                        {'Statement': [{'Action': 'sts:AssumeRole',
                          'Effect': 'Allow',
                          'Principal': {'Service': 'redshift.amazonaws.com'}}],
                         'Version': '2012-10-17'})
                    )

except ClientError as e:
    print(f'ERROR: {e}')

print("1.2 Attaching Policy")

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                       )['ResponseMetadata']['HTTPStatusCode']
print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
print(roleArn)

SSLError: SSL validation failed for https://iam.amazonaws.com/ [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)

In [64]:
print("1.1 Client is created ...")
redshift = boto3.client('redshift',
                        region_name="us-west-2",
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                        )
try:
    print("1.2 Cluster config is being created ...")
    redshift.create_cluster(
        # HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        # Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,

        # Roles (for s3 access)
        IamRoles=[roleArn])
except ClientError as e:
    print(f'ERROR: {e}')

print("1.3 Cluster is being created ...")
while redshift.describe_clusters(
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)\
        ['Clusters'][0]['ClusterStatus'] != 'available':
    utils.animate()

print("\r1.4 Cluster is created successfully ...")

1.1 Client is created ...
1.2 Cluster config is being created ...
1.3 Cluster is being created ...
1.4 Cluster is created successfully ...


In [65]:
DWH_ENDPOINT = redshift.describe_clusters(
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)\
    ['Clusters'][0]['Endpoint']['Address']

In [66]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [67]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwh-cluster.cgjrwscs7tjx.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

In [68]:
%sql DROP TABLE IF EXISTS log_staging

 * postgresql://dwhuser:***@dwh-cluster.cgjrwscs7tjx.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [69]:
%%sql CREATE TABLE IF NOT EXISTS log_staging (
        artist VARCHAR, 
        auth VARCHAR, 
        firstname VARCHAR, 
        gender VARCHAR, 
        iteminsession VARCHAR, 
        lastname VARCHAR, 
        length VARCHAR, 
        level VARCHAR, 
        location VARCHAR, 
        method VARCHAR,
        page VARCHAR, 
        registration VARCHAR, 
        sessionId VARCHAR, 
        song VARCHAR,
        status VARCHAR,
        ts VARCHAR,
        useragent VARCHAR,
        userid VARCHAR); 

 * postgresql://dwhuser:***@dwh-cluster.cgjrwscs7tjx.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [136]:
%%sql 
copy log_staging
from 's3://udacity-dend/log_data/2018' 
credentials 'aws_iam_role=arn:aws:iam::764499268961:role/dwh-role'
emptyasnull
blanksasnull
json 'auto'
timeformat 'auto';

 * postgresql://dwhuser:***@dwh-cluster.cgjrwscs7tjx.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [142]:
%sql SELECT * FROM log_staging LIMIT 5

 * postgresql://dwhuser:***@dwh-cluster.cgjrwscs7tjx.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
A Fine Frenzy,Logged In,,F,,,267.91138,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1541044398796,,Almost Lover (Album Version),200,1541377992796,,
Nirvana,Logged In,,F,,,214.77832,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Serve The Servants,200,1541381242796,,
Television,Logged In,,F,,,238.49751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,See No Evil (Remastered LP Version),200,1541381456796,,
JOHN COLTRANE,Logged In,,F,,,346.43546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Blues To Bechet (LP Version),200,1541381694796,,
NOFX,Logged In,,F,,,80.79627999999998,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,It's My Job To Keep Punk Rock Elite,200,1541382040796,,


In [134]:
%sql SELECT err_code, err_reason FROM STL_LOAD_ERRORS ORDER BY starttime DESC LIMIT 100

 * postgresql://dwhuser:***@dwh-cluster.cgjrwscs7tjx.us-west-2.redshift.amazonaws.com:5439/dwh
97 rows affected.


err_code,err_reason
1202,Extra column(s) found
1202,Extra column(s) found
1202,Extra column(s) found
1202,Extra column(s) found
1202,Extra column(s) found
1202,Extra column(s) found
1202,Extra column(s) found
1202,Extra column(s) found
1214,Delimited value missing end quote
1214,Delimited value missing end quote


In [29]:
import boto3
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

In [30]:
udacity_labs_bucket = s3.Bucket('udacity-dend')
for obj in udacity_labs_bucket.objects.filter(Prefix='log_data'):
    print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-04-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-05-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-06-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-07-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-08-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-09-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-10-events.json')
s3.ObjectSummary(b

In [None]:
print("1.1 Client is created ...")
redshift = boto3.client('redshift',
                        region_name="us-west-2",
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                        )
print("1.2 Cluster is identified ...")
try:
    redshift.delete_cluster(
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        SkipFinalClusterSnapshot=True)
except ClientError as e:
    print(f'ERROR: {e}')

try:
    print("1.3 Cluster is being deleted ...")
    while redshift.describe_clusters(
            ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)\
            ['Clusters'][0]['ClusterStatus'] == 'deleting':
        utils.animate()
except:
    print("\r1.4 Cluster is deleted successfully ...")

1.1 Client is created ...
1.2 Cluster is identified ...
1.3 Cluster is being deleted ...
Please Wait ...\

In [None]:
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)