# Project: Data Warehouse

In [15]:
import pandas as pd
import boto3
import json
import psycopg2
import pandas.io.sql as sqlio

## STEP 1: Create Redshift Cluster

### 1.1: Make sure you have an AWS secret and access key

- Create a new IAM user in your AWS account
- Give it `AdministratorAccess`, From `Attach existing policies directly` Tab
- Take note of the access key and secret 
- Edit the file `redshift.cfg` in the same folder as this notebook and fill
<font color='red'>
<BR>
[AWS]<BR>
KEY= YOUR_AWS_KEY<BR>
SECRET= YOUR_AWS_SECRET<BR>
<font/>


### 1.2 Load DWH Params from a file

In [16]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('redshift.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,myRedshiftRole


### 1.3 Create clients for IAM, EC2, S3 and Redshift

In [17]:
import boto3

ec2 = boto3.resource('ec2',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='us-west-2'
                  )

redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

### 1.4: Creat and attach IAM ROLE
- Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [18]:
from botocore.exceptions import ClientError

#1.1 Create the role, 
try:
    print("1.1 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path = '/',
        RoleName = DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument = json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
    

print("1.2 Attaching Policy")

iam.attach_role_policy(RoleName = DWH_IAM_ROLE_NAME,
                       PolicyArn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName = DWH_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name myRedshiftRole already exists.
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::754152170941:role/myRedshiftRole


### 1.5 Create Redshift Cluster

- Create a RedShift Cluster
- For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [19]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType = DWH_CLUSTER_TYPE,
        NodeType = DWH_NODE_TYPE,
        NumberOfNodes = int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName = DWH_DB,
        ClusterIdentifier = DWH_CLUSTER_IDENTIFIER,
        MasterUsername = DWH_DB_USER,
        MasterUserPassword = DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles = [roleArn]  
    )
except Exception as e:
    print(e)

### 1.6 *Describe* the cluster to see its status
- run this block several times until the cluster status becomes `Available`

In [21]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId', 'AvailabilityZone']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier = DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.ciz53qbikdxc.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-8176c4f9
7,AvailabilityZone,us-west-2b
8,NumberOfNodes,4


<h3> 1.7 Take note of the cluster <font color='red'> endpoint and role ARN </font> </h2>

<font color='red'>DO NOT RUN THIS unless the cluster status becomes "Available" </font>

In [22]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.ciz53qbikdxc.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::754152170941:role/myRedshiftRole


* Take note of the `DWH_ENDPOINT` fill the `HOST` value in `dwh.cfg` file.
* Take note of the `DWH_ROLE_ARN` fill the `ARN` value in `dwh.cfg` file.

### 1.8 Open an incoming  TCP port to access the cluster ednpoint

In [23]:
try:
    vpc = ec2.Vpc(id = myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName = defaultSg.group_name,
        CidrIp = '0.0.0.0/0',
        IpProtocol = 'TCP',
        FromPort = int(DWH_PORT),
        ToPort = int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-b81fa7f7')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


## STEP 2: Create Tables
* Connect to the database and create staging, fact and dimension tables

In [24]:
%run create_tables.py

Database connected
Dropping existing tables...
All existing dropped
Creating tables...
All tables created


## STEP 3: ETL
* Load data from S3 to staging tables
* Load data from staging tables to analytics tables

In [25]:
%run etl.py

Loading staging tables...
Staging tables loaded
Inserted data into tables...
All data inserted into tables


## STEP 4: Test
* Displays the first few rows of each table confirm the records are successfully inserted

In [26]:
config.read('dwh.cfg')

conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
cur = conn.cursor()

In [27]:
sql = 'SELECT * FROM songplays LIMIT 5;'
sqlio.read_sql_query(sql, conn)

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,57,2018-11-07 00:40:08.796,97,paid,,,293,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
1,121,2018-11-06 15:56:20.796,2,free,,,126,"Plymouth, IN","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
2,185,2018-11-06 16:09:06.796,2,free,,,126,"Plymouth, IN","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
3,249,2018-11-07 13:56:41.796,15,paid,,,221,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
4,313,2018-11-03 19:21:49.796,95,paid,,,152,"Winston-Salem, NC","""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53"""


In [28]:
sql = 'SELECT * FROM users LIMIT 5;'
sqlio.read_sql_query(sql, conn)

Unnamed: 0,user_id,first_name,last_name,gender,level
0,95,Sara,Johnson,F,paid
1,37,Jordan,Hicks,F,free
2,10,Sylvie,Cruz,F,free
3,83,Stefany,White,F,free
4,29,Jacqueline,Lynch,F,free


In [29]:
sql  = 'SELECT * FROM songs LIMIT 5;'
sqlio.read_sql_query(sql, conn)

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOLLHMX12AB01846DC,The Emperor Falls,AR1Y2PT1187FB5B9CE,0,484.62322
1,SOAUIQZ12A8C13E7A2,Song of Doing Laundry,AROEG4C1187B99DC4A,0,258.87302
2,SOBVVUL12AB0180166,The Modern Leper,AR9W55Y1187FB4AA37,2007,219.74159
3,SOENUJX12AB017EF68,Procession,AR0YHRY1187FB58249,2005,100.44036
4,SOIVSQZ12A6D4F68BF,I'm The Man (Explicit) (Feat. Jeru The Damaja And Lil Dap),ARDSWIE1187FB39056,1992,244.32281


In [30]:
sql  = 'SELECT * FROM artists LIMIT 5;'
sqlio.read_sql_query(sql, conn)

Unnamed: 0,artist_id,name,location,latitude,longitude
0,AR1C2IX1187B99BF74,Broken Spindles,,,
1,ARBDGN21187FB4C201,Weatherbox,"San Diego, CA",,
2,ARUBN8J1187B9AC085,Nox,"The Hague, Netherlands",52.08399,4.31741
3,ARW90Y31187B98FC7C,Pinchers,,,
4,ARQOG1Y1187B99BF80,The Far East Movement featuring Lil Rob and Baby Bash,California - LA,,


In [32]:
sql = 'SELECT * FROM time LIMIT 5;'
sqlio.read_sql_query(sql, conn)

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-03 15:36:08.796,15,3,44,11,2018,7
1,2018-11-03 16:21:41.796,16,3,44,11,2018,7
2,2018-11-03 16:50:42.796,16,3,44,11,2018,7
3,2018-11-07 00:28:00.796,0,7,45,11,2018,4
4,2018-11-07 00:52:29.796,0,7,45,11,2018,4


## STEP 5: Clean up the resources

In [33]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.ciz53qbikdxc.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2019, 7, 14, 16, 4, 5, 550000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-b81fa7f7',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-8176c4f9',
  'AvailabilityZone': 'us-west-2b',
  'PreferredMaintenanceWindow': 'mon:07:30-mon:08:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible':

- run this block several times until the cluster really deleted

In [13]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster dwhcluster not found.

In [14]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
#iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)
#### CAREFUL!!

{'ResponseMetadata': {'RequestId': '15c92307-a623-11e9-b851-edd318e9ee3e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '15c92307-a623-11e9-b851-edd318e9ee3e',
   'content-type': 'text/xml',
   'content-length': '212',
   'date': 'Sun, 14 Jul 2019 10:35:22 GMT'},
  'RetryAttempts': 0}}