# MASS / MASA Kinesis - Lambda

First, import all required packages and modules, and add the parent directory to the Python path.

The Pilot-Compute Description is a simple key/value style description of the cluster environment that should be started. 

In [None]:
# System Libraries
import sys, os
sys.path.append("..")
import pandas as pd
import datetime
## logging
import logging
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)
 
import pyspark
import os
import boto3
boto3.setup_default_session(profile_name='dev')
import time
import mass.kafka
    
# Pilot-Streaming
import pilot.streaming
sys.modules['pilot.streaming']

# 1. Kinesis

Start the Kinesis stream and wait for startup completion

In [None]:
%%time
# Key/value description of the Kinesis pilot that Pilot-Streaming should start.
pilot_compute_description = dict(
    resource="kinesis://awscloud.com",
    number_cores=1,
    type="kinesis",
)
kinesis_pilot = pilot.streaming.PilotComputeService.create_pilot(
    pilot_compute_description
)

# Wait for the pilot, then keep its details around: later cells read
# kinesis_details["master_url"] to locate the stream.
kinesis_pilot.wait()
kinesis_details = kinesis_pilot.get_details()

# Lambda

In [None]:
def lambda_handler(event, context):
    """Run one MiniBatchKMeans step per Kinesis record and log timing metrics.

    For every entry in ``event['Records']`` the handler base64-decodes the
    Kinesis payload, unpickles it into a 2-D array-like object, fits a fresh
    two-cluster ``MiniBatchKMeans`` model on it with ``partial_fit`` and prints
    a CSV-style measurement line (processing time and latency relative to the
    record's ``approximateArrivalTimestamp``).

    Args:
        event: Kinesis event dict with a ``"Records"`` list; each record must
            provide ``record["kinesis"]["data"]`` (base64 text) and
            ``record["kinesis"]["approximateArrivalTimestamp"]`` (a number that
            can be subtracted from ``time.time()``).
        context: Lambda context object. Only ``aws_request_id``,
            ``log_group_name`` and ``log_stream_name`` are read; missing
            attributes (or ``context=None`` in a local run) are logged as
            ``None`` instead of failing the record.

    Returns:
        None. All results are written to stdout.

    A record that fails is reported on stdout with its full exception and the
    loop continues with the next record.
    """
    # Imports stay inside the function so it is self-contained.
    # NOTE(review): presumably required because only this function is shipped
    # to AWS Lambda - confirm against the Pilot-Streaming packaging code.
    import base64
    import pickle
    import sys
    import time
    import traceback

    import sklearn.cluster

    # Resolve the context fields once; getattr keeps local invocations with a
    # stub or None context from failing on every record.
    request_id = getattr(context, "aws_request_id", None)
    log_group = getattr(context, "log_group_name", None)
    log_stream = getattr(context, "log_stream_name", None)

    for record in event['Records']:
        try:
            start = time.time()
            payload = base64.b64decode(record["kinesis"]["data"])
            # SECURITY: pickle.loads executes arbitrary code embedded in the
            # payload. Only attach this handler to streams whose producers are
            # trusted.
            data = pickle.loads(payload)
            num_points = data.shape[0]
            num_dim = data.shape[1]
            broker_time = record["kinesis"]['approximateArrivalTimestamp']

            # A new model is created for every record, so nothing is carried
            # over between records or invocations.
            kmeans_model = sklearn.cluster.MiniBatchKMeans(n_clusters=2)
            kmeans_model = kmeans_model.partial_fit(data)
            end = time.time()

            print("Context Information:", request_id, log_group, log_stream)
            print("Measurement, Request ID, Log Group, Log Stream, Cores,Number Points, Number Dimensions, Processing Time, Latency")
            print("LambdaKMeans, %s, %s, %s, 1, %d, %d, %.5f, %.5f" % (
                request_id,
                log_group,
                log_stream,
                num_points,
                num_dim,
                end - start,
                end - broker_time,
            ))
        except Exception:
            # Best effort: report the failure and keep processing the
            # remaining records of the batch.
            print("Error processing record")
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb: ")
            traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
            print("*** print_exception: ")
            traceback.print_exception(exc_type, exc_value, exc_traceback, file=sys.stdout)

                  

In [None]:
%%time
# Key/value description of the Lambda pilot: it takes the Kinesis pilot's id as
# its input, lambda_handler as the function to run and the sklearn.zip archive
# as its layer.
pilot_compute_description = dict(
    resource="kinesis://awscloud.com",
    number_cores=1,
    lambda_input_data=kinesis_pilot.get_id(),
    lambda_function=lambda_handler,
    lambda_layer=os.path.join("../layers", "sklearn.zip"),
    type="lambda",
)
lambda_pilot = pilot.streaming.PilotComputeService.create_pilot(
    pilot_compute_description
)
lambda_pilot.wait()

# Last expression of the cell: the pilot details are shown as the cell output.
lambda_pilot.get_details()

# MiniApp Data Generator

Produce some Test Data for K-Means
Produce some more data for tests

In [None]:
# Configure the MiniApp data generator against the Kinesis stream started above.
# NOTE(review): parameter semantics are defined by mass.kafka.MiniApp; the
# grouping below is only for readability.
miniapp = mass.kafka.MiniApp(
    # Where and how to publish.
    dask_scheduler=None,
    resource_url=kinesis_details["master_url"],
    broker_service="kinesis",
    topic_name="test",
    number_partitions=1,
    number_parallel_tasks=1,
    # Workload definition.
    application_type="kmeans",
    number_clusters=3,
    number_points_per_cluster=1000,
    number_points_per_message=1000,
    number_messages=1,
    number_dim=3,
    number_produces=100,
)

In [None]:
# Run the MiniApp data generator configured in the previous cell.
miniapp.run()

In [None]:
%%time
# Cancel both pilots: first the Lambda pilot, then the Kinesis pilot it takes as input.
# NOTE(review): the Scratch cells below still use kinesis_pilot / kinesis_details;
# if cancel() removes the stream they have to be run before this cell - confirm.
lambda_pilot.cancel()
kinesis_pilot.cancel()

# Scratch

Code for manually receiving messages from Kinesis

In [None]:
# Manually read back the raw payloads that were published to the Kinesis stream.
boto3.setup_default_session(profile_name='dev')
kinesis_client = boto3.client('kinesis', region_name='us-east-1')
stream_name = kinesis_details["master_url"].split("/")[1]
print("Stream Name: %s"%stream_name)
stream = kinesis_client.describe_stream(StreamName=stream_name)['StreamDescription']

# Upper bound on GetRecords calls per shard so the cell always terminates, even
# if producers keep writing while we read.
MAX_POLLS_PER_SHARD = 20
# Pause between polls to stay below the per-shard GetRecords rate limit.
POLL_INTERVAL_SECONDS = 0.25
# Start reading 30 minutes in the past (timezone-aware UTC timestamp).
read_from = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=30)

messages = []
for shard in stream['Shards']:
    print("### %s - %s"%(stream_name, shard['ShardId']))
    shard_iterator = kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard['ShardId'],
        ShardIteratorType='AT_TIMESTAMP',  #'TRIM_HORIZON'|'LATEST'
        Timestamp=read_from
    )['ShardIterator']

    # A single GetRecords call may legitimately return an empty batch before
    # the reader has caught up, so follow NextShardIterator until the shard
    # reports that we reached its tip (or the poll budget is used up).
    for _ in range(MAX_POLLS_PER_SHARD):
        out = kinesis_client.get_records(ShardIterator=shard_iterator, Limit=1000)
        messages.extend(record["Data"] for record in out["Records"])
        shard_iterator = out.get("NextShardIterator")
        if shard_iterator is None or out.get("MillisBehindLatest", 0) == 0:
            break
        time.sleep(POLL_INTERVAL_SECONDS)

print("Received %d messages" % len(messages))

In [None]:
import pickle
# Decode the second received payload (index 1); this needs at least two entries
# in `messages`.
# NOTE(review): pickle.loads can execute arbitrary code contained in the payload;
# only use it on data written by a trusted producer.
m=pickle.loads(messages[1])

In [None]:
# Display the shape of the decoded message.
m.shape

Code for manually sending messages to Kinesis

In [None]:
# Publish a single plain-text test record to the stream.
# NOTE(review): the stream name is taken from kinesis_pilot.get_id() here but from
# kinesis_details["master_url"] in the receiving cell - presumably both name the
# same stream; confirm.
# NOTE(review): the payload is not a pickled array, so lambda_handler is expected
# to log an error for this record if it is still attached to the stream.
kinesis_client = boto3.client('kinesis', region_name='us-east-1')
put_response = kinesis_client.put_record(
                        StreamName=kinesis_pilot.get_id().split("/")[1],
                        Data="Hello World",
                        PartitionKey="A")

In [None]:
import base64
import types

# Hand-built Kinesis event used to exercise lambda_handler locally, without AWS.
kinesis_event={
  "Records": [
    {
      "kinesis": {
        "partitionKey": "partitionKey-03",
        "kinesisSchemaVersion": "1.0",
        # lambda_handler base64-decodes this field, while messages[-1] holds the
        # raw bytes returned by get_records, so encode it the way the handler
        # expects to receive it.
        "data": base64.b64encode(messages[-1]).decode("ascii"),
        "sequenceNumber": "49545115243490985018280067714973144582180062593244200961",
        # Fixed sample timestamp: the latency printed by the handler is not
        # meaningful for this local run.
        "approximateArrivalTimestamp": 1428537600
      },
      "eventSource": "aws:kinesis",
      "eventID": "shardId-000000000000:49545115243490985018280067714973144582180062593244200961",
      "invokeIdentityArn": "arn:aws:iam::EXAMPLE",
      "eventVersion": "1.0",
      "eventName": "aws:kinesis:record",
      "eventSourceARN": "arn:aws:kinesis:EXAMPLE",
      "awsRegion": "us-east-1"
    }
  ]}

# Minimal stand-in for the Lambda context object: the handler reads exactly
# these three attributes when it prints its measurement line.
local_context = types.SimpleNamespace(
    aws_request_id="local-test",
    log_group_name="local-test",
    log_stream_name="local-test",
)
lambda_handler(kinesis_event, local_context)