# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
# Task 1.1
# import statements
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
import csv

# accepts a filename and reads the CSV data,
# returning it in the form of a list of dictionaries
def read_csv(fileName):
    """Load every data row of a CSV file as a dictionary.

    The first line of the file is used as the header; each following row
    becomes a dict mapping header names to the raw string values.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order. A file that
        contains only a header yields an empty list.
    """
    # The context manager closes the file even if parsing fails.
    # newline='' lets the csv module handle line endings itself, so line
    # breaks inside quoted fields are preserved as written.
    with open(fileName, newline='') as csv_file:
        return list(csv.DictReader(csv_file))

# hands one payload to the producer for the given topic
def publish_message(producer_instance, topic_name, data):
    """Send ``data`` to ``topic_name`` through ``producer_instance``.

    This is best-effort: any Exception raised by the send call is caught
    and reported on stdout instead of being propagated to the caller.

    Args:
        producer_instance: Object exposing ``send(topic, value)``.
        topic_name: Name of the topic to publish to.
        data: Payload passed to ``send`` unchanged.

    Returns:
        None, whether or not the send succeeded.
    """
    try:
        producer_instance.send(topic_name, data)
    except Exception as error:
        # Fixed banner line followed by the error text, one per line.
        print('Exception in publishing message.', str(error), sep='\n')
        
# connects to the kafka server and configures how the stream is encoded
def connect_memory_producer():
    """Create the Kafka producer used to publish the stream.

    The producer targets ``localhost:9092`` and serialises every value by
    dumping it to JSON and encoding the resulting text as ASCII bytes.

    Returns:
        The KafkaProducer instance, or None when creating it raised an
        Exception (the error is printed to stdout).
    """
    # Returning from the try/except branches rather than from a ``finally``
    # clause keeps KeyboardInterrupt and SystemExit from being silently
    # swallowed while the connection is being set up.
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None

# custom function to transform the data
# collects the next batch of rows for one machine, cycling through the data
def mac_count(cRows, mac, start_index, num_rows):
    """Gather the next ``num_rows`` rows whose 'machine' value equals ``mac``.

    Scanning starts at ``start_index`` and wraps around to the beginning of
    ``cRows`` when the end is reached, so a machine with fewer than
    ``num_rows`` rows has its rows repeated to fill the batch.

    Args:
        cRows: Sequence of row dictionaries, each read with ``.get('machine')``.
        mac: Machine identifier to select rows for.
        start_index: Position in ``cRows`` where scanning resumes. A value
            outside ``0 <= start_index < len(cRows)`` restarts from 0.
        num_rows: Number of rows to collect.

    Returns:
        A ``(next_index, row_list)`` tuple. ``next_index`` is the position
        right after the last row examined (wrapped to 0 at the end of the
        data) and is meant to be passed back in as ``start_index`` on the
        next call. ``row_list`` holds the collected rows; it is empty when
        ``num_rows`` is not positive, when ``cRows`` is empty, or when no
        row belongs to ``mac``.
    """
    total = len(cRows)
    row_list = []
    if num_rows <= 0:
        # Nothing requested: leave the caller's position untouched.
        return start_index, row_list
    if total == 0:
        return 0, row_list

    index = start_index if 0 <= start_index < total else 0
    # Consecutive non-matching rows seen; reaching ``total`` means a full
    # lap found nothing for this machine, so stop instead of spinning forever.
    misses = 0
    while len(row_list) < num_rows and misses < total:
        row = cRows[index]
        if row.get('machine') == mac:
            row_list.append(row)
            misses = 0
        else:
            misses += 1
        # Advance by exactly one row so that no row is skipped between calls.
        index = (index + 1) % total
    return index, row_list
    
# main
if __name__ == '__main__':
    # Kafka topic that receives the process data
    topic = 'Process'
    # reading the data using read_csv function defined above
    cRows = read_csv('Streaming_Linux_process.csv')
    # connecting the producer
    producer = connect_memory_producer()
    # connect_memory_producer() returns None when the connection fails;
    # stop here instead of looping forever and logging a failed send
    # every five seconds.
    if producer is None:
        raise SystemExit('Unable to connect to Kafka; nothing can be published.')
    # next start index for each machine, initialised to 0;
    # dict.fromkeys de-duplicates while keeping first-appearance order,
    # so machines are visited in the same order on every run
    mac_dict = dict.fromkeys((row.get('machine') for row in cRows), 0)

    while True:
        # one timestamp (whole seconds) shared by every row in this batch
        ts = int(dt.datetime.now().timestamp())
        data = []
        # for each machine send a random number of rows
        for key in mac_dict:
            num_rows = random.randint(10, 50)
            # mac_count hands back the position to resume from next time
            mac_dict[key], row_list = mac_count(cRows, key, mac_dict[key], num_rows)
            # copy each row with the timestamp added, leaving cRows untouched
            data.extend({**item, 'ts': ts} for item in row_list)
        # sending the data
        publish_message(producer, topic, data)
        # sleep for 5 seconds
        sleep(5)