<a href="https://colab.research.google.com/github/tadinve/rsa_on_aws/blob/main/notebook/01_Cadabra_Data_Ingestions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0 - AWS Setup


1. Do this only Once
- Create an account in AWS
- Create an IAM User and download the credentials file (you will need the access key and the secret key)
- Run `aws configure` to create the `credentials` and `config` files
- copy credentials file to AWS_CRED_DIR


In [None]:
%%bash
# Download the AWS CLI v2 installer bundle for x86_64 Linux.
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
# -o overwrites files left by an earlier extraction without prompting, so a
# re-run installs from the freshly downloaded bundle.
unzip -o -qq awscliv2.zip
# --update lets the installer replace an existing installation; without it a
# second run of this cell aborts because the CLI is already installed.
sudo ./aws/install --update

In [None]:
# Mount Google Drive at /gdrive; the AWS_CRED_DIR and TF_DIR paths used by this
# notebook are located under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

## Define Directory Variables

In [None]:
def escapeSlash(s):
    """Return ``s`` with a backslash placed before every forward slash.

    The notebook uses this on the AWS secret key before interpolating it into
    a ``sed 's/.../.../'`` command, where an unescaped "/" would be read as
    the sed delimiter.

    Parameters
    ----------
    s : str
        Text to escape.

    Returns
    -------
    str
        ``s`` with each "/" preceded by a backslash; unchanged if it has none.
    """
    # The backslash is written as an explicit escaped backslash: a lone
    # backslash before "/" is an invalid escape sequence that Python warns about.
    return s.replace("/", "\\/")

In [None]:
#python variables
# Directory the notebook changes into before downloading and building tools.
HOME = "/root"
# Google Drive folder that holds the `credentials` and `config` files; it ends
# with "/" because the file names below are appended by plain concatenation.
AWS_CRED_DIR = "/gdrive/MyDrive/AWS/ca_dev/"
AWS_CRED_FILE = AWS_CRED_DIR + "credentials"
AWS_CONFIG_FILE = AWS_CRED_DIR + "config"
# NOTE(review): presumably the Terraform project directory; not referenced by
# any other cell in this notebook — confirm it is still needed.
TF_DIR = "/gdrive/MyDrive/AWS/AWS-TF/Cadabra"
# Directory the generated transaction logs are written to (trailing "/" required,
# file names are appended by concatenation).
TRANSACTIONS_DIR="/data/"
# Only used by the commented-out S3 clean-up commands (bucket name infix).
NAME_INITIALS = "" #ADD YOUR NAME INITIALS **********************************

In [None]:
#shell variables using python variables
# Export the Python values into the process environment so that %%bash cells
# can read them as $AWS_CRED_DIR and $AWS_CRED_FILE.
%env AWS_CRED_DIR={AWS_CRED_DIR}
%env AWS_CRED_FILE={AWS_CRED_FILE}

## Check for Credentials file

Reference for the Bash file-existence test used in the next cell: https://linuxize.com/post/bash-check-if-file-exists/

In [None]:
%%bash
# Run `aws configure` only when $AWS_CRED_FILE is not an existing regular file
# (-f tests for a regular file, ! negates the test).
# NOTE(review): `aws configure` asks for input interactively — confirm it can be
# answered from a %%bash cell and that its output ends up where later cells expect.
if [ ! -f "$AWS_CRED_FILE" ]; then 
    aws configure 
fi

In [None]:
# Create ~/.aws if needed, then copy every file in AWS_CRED_DIR whose name
# starts with "c" (e.g. `credentials` and `config`) into it.
!mkdir -p ~/.aws
!cp {AWS_CRED_DIR}c* ~/.aws

In [None]:
#Read the credentials
# Both files are parsed by position, not by key name:
#   credentials: line 1 = section header, line 2 = access key, line 3 = secret key
#   config:      line 1 = section header, line 2 = region
# Each value is whatever follows the first "=" on its line, stripped of whitespace.
# split("=", 1) keeps any further "=" characters inside the value, and the
# `with` blocks close both files even if parsing raises.
with open(AWS_CRED_FILE) as cred_file:
    header = cred_file.readline()
    access_key = cred_file.readline().split("=", 1)[1].strip()
    # The secret key may contain "/", which must be escaped because it is later
    # interpolated into a sed 's/.../.../' command.
    secret_key = escapeSlash(cred_file.readline().split("=", 1)[1].strip())

with open(AWS_CONFIG_FILE) as config_file:
    header = config_file.readline()
    user_region = config_file.readline().split("=", 1)[1].strip()

In [None]:
#import os
#os.environ["AWS_ACCESS_KEY_ID"]=access_key
#os.environ["AWS_SECRET_ACCESS_KEY"]=secret_key
#os.environ["AWS_DEFAULT_REGION"]=user_region
#os.environ['AWS_PROFILE'] = "default"

# Install Kinesis & Start Agent



## Install Java 1.8
- needed by Kinesis build
- from https://stackoverflow.com/questions/58106622/how-to-change-the-java-version-in-google-colab

In [None]:
%cd {HOME}

In [None]:
# Refresh the package index, then install the headless OpenJDK 8 JDK quietly.
!sudo apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
# Point JAVA_HOME at the JDK 8 location and select its `java` binary as the
# system alternative.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
# Print the active Java version to confirm the switch took effect.
!java -version

## Get Kinesis Code from Github and Build

In [None]:
# Explicit %cd magic: a bare `cd` only works while IPython automagic is enabled
# and no Python variable named `cd` exists.
%cd {HOME}

In [None]:
# Release tag of amazon-kinesis-agent; it is interpolated into the GitHub
# archive URL and into the name of the extracted directory.
KINESIS_AGENT_VER = "1.1.4"

In [None]:
# Download the source archive for the chosen release (-L follows redirects,
# -O keeps the remote file name), then extract it into the current directory.
!curl -LO https://github.com/awslabs/amazon-kinesis-agent/archive/{KINESIS_AGENT_VER}.tar.gz
!tar -xvzf {KINESIS_AGENT_VER}.tar.gz

In [None]:
# Give the extracted tree a version-independent name, enter it, and run the
# setup script shipped with the agent in install mode.
# NOTE(review): re-running after a successful first run will not repeat the
# rename cleanly because `amazon-kinesis-agent` already exists.
!mv amazon-kinesis-agent-{KINESIS_AGENT_VER} amazon-kinesis-agent
%cd amazon-kinesis-agent
!sudo ./setup --install

## Configure Kinesis Agent

In [None]:
# Explicit %cd magic (bare `cd` depends on automagic); agent.json is written
# relative to this directory by the next cell.
%cd /etc/aws-kinesis

In [None]:
%%writefile agent.json
{
  "cloudwatch.emitMetrics": true,
  "awsAccessKeyId": "ACCESS_KEY",
  "awsSecretAccessKey": "SECRET_KEY",
  "firehose.endpoint": "firehose.USER_REGION.amazonaws.com",
  "kinesis.endpoint": "kinesis.USER_REGION.amazonaws.com",
 
  "flows": [
    {
      "filePattern": "/data/*.log",
      "deliveryStream": "tf-cadabra_batch_sales",
      "initialPosition": "START_OF_FILE"
    },
    {
      "filePattern": "/data/*.log",
      "kinesisStream": "tf-order-stream",
      "partitionKeyOption": "RANDOM",
      "dataProcessingOptions": [
         {
            "optionName": "CSVTOJSON",
            "customFieldNames": ["InvoiceNo", "StockCode", "Description", "Quantity", "InvoiceDate", "UnitPrice", "Customer", "Country"]
         }
      ]
    },
    {
      "filePattern": "/var/log/httpd/*.log",
      "deliveryStream": "tf-weblogs",
      "initialPosition": "START_OF_FILE"
    },
    {
      "filePattern": "/data/*.log",
      "kinesisStream": "tf-redshift-datastream",
      "partitionKeyOption": "RANDOM"
    }
  ]
}

In [None]:
# Replace the USER_REGION / ACCESS_KEY / SECRET_KEY placeholders in agent.json,
# in place, with the values read from the credentials and config files.
# "/" is the sed delimiter here, which is why secret_key was passed through
# escapeSlash when it was read.
!sed -i 's/USER_REGION/{user_region}/g' agent.json
!sed -i 's/ACCESS_KEY/{access_key}/g' agent.json
!sed -i 's/SECRET_KEY/{secret_key}/g' agent.json

In [None]:
!cat agent.json

In [None]:
!ls -alh /etc/aws-kinesis/ 

## Start Kinesis Agent

In [None]:
!sudo service aws-kinesis-agent status

In [None]:
!sudo service aws-kinesis-agent start

In [None]:
!sleep 30

In [None]:
!cat /var/log/aws-kinesis-agent/aws-kinesis-agent.log  

# Generate Data

In [None]:
!mkdir -p {TRANSACTIONS_DIR}
%cd {HOME}

## Get Requirements

In [None]:
#get requirements.txt
!wget --no-check-certificate -q 'https://docs.google.com/uc?export=download&id=1-HiavLHD1DPYov9YVyH-xJ4ggdN4AahF' -O requirements.txt

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install -r requirements.txt

## Download Source Files and Data Template

In [None]:
!ls -ltr

In [None]:
!wget --no-check-certificate -q 'https://docs.google.com/uc?export=download&id=1gDonM3p8OKk9nKCb853N3jxLZ5qsE9pF' -O cadabra_products.csv

In [None]:
!wget https://raw.githubusercontent.com/tadinve/rsa_on_aws/main/src/python_code/transaction_generator.py -O transaction_generator.py

In [None]:
# Fetch the Redshift helper module. TLS certificate verification is left on
# (no --no-check-certificate): this file is imported and executed below, and the
# same host is already fetched with verification for transaction_generator.py.
!wget -q 'https://raw.githubusercontent.com/tadinve/rsa_on_aws/main/src/python_code/create_redshift_tables.py' -O create_redshift_tables.py

In [None]:
#Create Target Schema and Tables in Redshift
# create_schema_tables comes from create_redshift_tables.py, which an earlier
# cell downloads into the working directory.
# NOTE(review): 'dev' and 'awsuser' look like the Redshift database and user
# names — confirm against the helper's signature.
from create_redshift_tables import create_schema_tables
create_schema_tables('dev','awsuser')

In [None]:
!wget -O access.log.zip https://www.dropbox.com/s/8spgz9gfbeaa34k/KagglWebLogs.zip

## Generate Transactions

In [None]:
# Generate 2500 transactions into <TRANSACTIONS_DIR>test.log; `outfile` is kept
# as a notebook variable because the following cell previews it with `head`.
from transaction_generator import generateTransactions

outfile = f"{TRANSACTIONS_DIR}test.log"
generateTransactions(OutFile=outfile, n=2500)

In [None]:
!head {outfile}

In [None]:
!sleep 60

In [None]:
# Unpack the downloaded web-log archive and move access.log into
# /var/log/httpd/, the directory matched by the agent's "/var/log/httpd/*.log" flow.
# NOTE(review): assumes the archive extracts a file named access.log into the
# current directory — confirm.
!unzip access.log.zip
!mkdir -p /var/log/httpd/
!mv access.log /var/log/httpd/

## Check Logs

In [None]:
!tail /var/log/aws-kinesis-agent/aws-kinesis-agent.log  

# Run Glue Crawler using aws CLI

In [None]:
%cd {HOME}

In [None]:
!aws --version

In [None]:
!aws glue start-crawler --name tf-cadabra_crawler

In [None]:
!sleep 130

In [None]:
# -O pins the output name so a re-run overwrites the helper; without it wget
# saves the new download as glue_schema_rename.py.1 and the import below keeps
# using the stale copy.
!wget https://raw.githubusercontent.com/tadinve/rsa_on_aws/main/src/python_code/glue_schema_rename.py -O glue_schema_rename.py

In [None]:
# rename_glue_schema comes from glue_schema_rename.py, downloaded by the
# previous cell.
# NOTE(review): the arguments look like the Glue database name and the table
# name — confirm against the helper's signature.
from glue_schema_rename import rename_glue_schema
rename_glue_schema('tf-cadabra_glue','salestransactions')

# Generate Huge Data

In [None]:
#!aws s3 rm --recursive s3://tf-{NAME_INITIALS}-cadabra
#!aws s3 rm --recursive s3://tf-{NAME_INITIALS}-failed-http-logs
#!aws s3 rm --recursive s3://tf-{NAME_INITIALS}-redshift-stage-sales

In [None]:
from transaction_generator import generateTransactions

# int(10**4.1) evaluates to 12589 generated transactions. The path is stored in
# `one_m_file` because log_1g reads that variable as its seed file.
one_m_file = f"{TRANSACTIONS_DIR}test_1M.log"
generateTransactions(OutFile=one_m_file, n=int(10**4.1))

In [None]:
!ls -alh /data/

In [None]:
def log_1g(n=3, source_path=None, out_dir="/data/"):
    """Write a larger test file by repeating a seed log ``10**n`` times.

    Parameters
    ----------
    n : int
        Index into the size-label table below. The seed content is written
        ``10**n`` times and the output file is named ``test_<label>.csv``.
    source_path : str, optional
        Seed file whose full content is repeated. Defaults to the
        notebook-level ``one_m_file``.
    out_dir : str
        Directory the output file is created in. Defaults to "/data/".

    Raises
    ------
    IndexError
        If ``n`` does not select a label. This is checked before any file is
        opened, so an invalid ``n`` never truncates an existing output file.
    """
    import os  # local import keeps this cell runnable on its own

    suffix = ["1M","10M","100M","1G","10G","100G","1TB"]
    if not 0 <= n < len(suffix):
        raise IndexError("n must be between 0 and %d, got %r" % (len(suffix) - 1, n))

    if source_path is None:
        source_path = one_m_file

    # Read the seed once; `with` closes the handle (the original left it open).
    with open(source_path) as seed:
        lines = seed.read()

    target = os.path.join(out_dir, "test_" + suffix[n] + ".csv")
    # `with` guarantees the output is flushed and closed even if a write fails.
    with open(target, "w") as f_out:
        for _ in range(10**n):
            f_out.write(lines)


In [None]:
# Build the files labelled 10M, 100M, 1G and 10G (label indices 1-4), i.e. the
# seed file repeated 10, 100, 1000 and 10000 times.
for i in range(1,5):
    log_1g(i)

In [None]:
!ls -alh /data/

In [None]:
# Rename to .log so the file matches the "/data/*.log" patterns in agent.json.
!mv /data/test_10M.csv /data/test_10M.log

In [None]:
import pprint

In [None]:
import boto3

# Describe the DynamoDB table and pretty-print the whole response.
# The client is bound to `client`; the original called describe_table on an
# undefined name (`dynamoDBClient`), which raised NameError.
# NOTE(review): the region is hard-coded here while other cells take it from
# the AWS config file (`user_region`) — confirm which one is intended.
client = boto3.client('dynamodb','us-west-2')
table = client.describe_table(TableName='tf-CadabraOrders')
pprint.pprint(table)

In [None]:
import boto3

# Read the item count through the resource API. No region is passed here,
# unlike the client-based cells that pass 'us-west-2' explicitly.
dynamoDBResource = boto3.resource('dynamodb')
table = dynamoDBResource.Table('tf-CadabraOrders')
print(table.item_count)

In [None]:
# Same count through the low-level client: describe the table and read
# ItemCount from the 'Table' entry of the response.
client = boto3.client('dynamodb','us-west-2')
response = client.describe_table(TableName='tf-CadabraOrders')
print(response['Table']['ItemCount'])