# 1. Producing the data (10%)
In this task, we will implement one Apache Kafka producer to simulate the real-time data
transfer from one repository to another.
## Important:
- Do not use Spark in this task
- In this part, all columns should be string type

Your program should send a random number (10~30, including 10 and 30) of client data
every 5 seconds to the Kafka stream in 2 different topics based on their origin files.
- For example, if the first random batch of customers' IDs is 1,2, and 3, you should also
send bureau data of them to the bureau topic. For every batch of data, you need to
add a new column 'ts', the current timestamp. The data in the same batch should
have the same timestamp.
For instance: batch1: [{ID=xxx,...,ts=123456}, {ID=xxx,...,ts=123456},......]
↑
one row

In [1]:
import random # random number (10~30, including 10 and 30)
import pandas as pd #processing data
from time import sleep # every 5 seconds
import datetime as dt # add a new column 'ts'
from json import dumps
from kafka import KafkaProducer


In [2]:
customer = pd.read_csv('customer.csv')
# For instance format, we want push as key_value pair,so we need to use pd.dataframe.to_dict to change the format.
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_dict.html

customer_list = customer.to_dict(orient='records')

In [3]:
bureau = pd.read_csv('bureau.csv')
# For instance format, we want push as key_value pair,so we need to use pd.dataframe.to_dict to change the format.
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_dict.html


In [4]:
bureau_colums = bureau.columns
bureau_colums

Index(['ID', 'SELF-INDICATOR', 'MATCH-TYPE', 'ACCT-TYPE', 'CONTRIBUTOR-TYPE',
       'DATE-REPORTED', 'OWNERSHIP-IND', 'ACCOUNT-STATUS', 'DISBURSED-DT',
       'CLOSE-DT', 'LAST-PAYMENT-DATE', 'CREDIT-LIMIT/SANC AMT',
       'DISBURSED-AMT/HIGH CREDIT', 'INSTALLMENT-AMT', 'CURRENT-BAL',
       'INSTALLMENT-FREQUENCY', 'OVERDUE-AMT', 'WRITE-OFF-AMT', 'ASSET_CLASS',
       'REPORTED DATE - HIST', 'DPD - HIST', 'CUR BAL - HIST',
       'AMT OVERDUE - HIST', 'AMT PAID - HIST', 'TENURE'],
      dtype='object')

In [5]:
for k, v in bureau.groupby("ID"):
    if k == 1:
        print(v)
    else:
        break
    

   ID  SELF-INDICATOR MATCH-TYPE             ACCT-TYPE CONTRIBUTOR-TYPE  \
0   1           False    PRIMARY  Auto Loan (Personal)              NAB   
1   1           False    PRIMARY             Overdraft              NAB   
2   1            True    PRIMARY  Auto Loan (Personal)              NBF   
3   1            True    PRIMARY  Auto Loan (Personal)              NBF   
4   1            True    PRIMARY          Tractor Loan              NBF   
5   1           False    PRIMARY           Credit Card              NAB   
6   1            True    PRIMARY  Auto Loan (Personal)              NBF   
7   1           False    PRIMARY  Auto Loan (Personal)              NAB   
8   1            True    PRIMARY          Tractor Loan              NBF   

         DATE-REPORTED OWNERSHIP-IND ACCOUNT-STATUS         DISBURSED-DT  \
0  2019-12-31 00:00:00    Individual         Active  2018-03-19 00:00:00   
1  2018-04-30 00:00:00    Individual     Delinquent  2015-10-05 00:00:00   
2  2017-07-31 00:00:0

In [6]:
bureau_groupbyID = dict()
for key, value in bureau.groupby("ID"):
    bureau_groupbyID[key] = []
    for v in value.values:
        bureau_groupbyID[key].append(dict(zip(bureau_colums,v)))

In [7]:
#copy from lab task
def publish_message(producer_instance, topic_name, data):
    try:
        producer_instance.send(topic_name, data)
        print('Message published successfully. Data: ' + str(data))
    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex))
        
        
def connect_kafka_producer():
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                  value_serializer=lambda x: dumps(x).encode('ascii'),
                                  api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
    finally:
        return _producer

In [9]:
if __name__ == '__main__':
   
    burtopic = 'topic bureau'
    custopic = 'topic customer'
    
    print('Publishing records..')
    producer = connect_kafka_producer()
    
    # 2 different topics based on their origin files
#create customer_data empty list to save the random data
#use customer_data ID to search Key-ID in bureau_groupbyID,then add the data in bureau_data

    while True:
        bureau_data_list = [] #Initialize each outgoing data
        customer_data_list = [] #Initialize each outgoing data
        random_send_num = random.randint(10,30) 
        for customer in random.sample(customer_list,random_send_num):#Select 10-30 from customer data at a time for the loop
            customer['ts'] = int(dt.datetime.now().timestamp())
            customer_data_list.append(customer)
            
            customer_bureau_list = bureau_groupbyID[customer["ID"]]
            for item in customer_bureau_list: # use choose customer id as search key, find bureau data.
                item['ts'] = int(dt.datetime.now().timestamp())
                bureau_data_list.append(item) #add data to bureau push list and also add the timestamp
    
        print(customer_data_list)
        print(bureau_data_list)
        #print("######################")
        #print(customer_data_list[0:2]])
        #print("bureau_data:")
        #print(bureau_data)
        publish_message(producer,custopic,customer_data_list)# use publish function to publish
        print(len(customer_data_list))
        print('++++++++++++++++++++++++++++++++++++')
        publish_message(producer,burtopic,bureau_data_list)# use publish function to publish
        print('+++++++++++++++++++++++++++++++++++++')
        sleep(5)

Publishing records..
[{'ID': 96581, 'Frequency': 'Quatrly', 'InstlmentMode': 'Arrear', 'LoanStatus': 'Closed', 'PaymentMode': 'ECS', 'BranchID': 24, 'Area': 'KANPUR', 'Tenure': 36, 'AssetCost': 550000, 'AmountFinance': 250000.0, 'DisbursalAmount': 250000.0, 'EMI': 20000.0, 'DisbursalDate': '2016-10-31 00:00:00', 'MaturityDAte': '2019-09-10 00:00:00', 'AuthDate': '2016-10-31 00:00:00', 'AssetID': 22465626, 'ManufacturerID': 1186.0, 'SupplierID': 91737, 'LTV': 41.82, 'SEX': 'M', 'AGE': 42.0, 'MonthlyIncome': 41666.67, 'City': 'KANPUR DEHAT', 'State': 'UTTAR PRADESH', 'ZiPCODE': 209204.0, 'Top-up Month': 'No Top-up Service', 'ts': 1666500955}, {'ID': 54399, 'Frequency': 'Half Yearly', 'InstlmentMode': 'Arrear', 'LoanStatus': 'Closed', 'PaymentMode': 'ECS', 'BranchID': 145, 'Area': 'NELLORE', 'Tenure': 48, 'AssetCost': 690000, 'AmountFinance': 450000.0, 'DisbursalAmount': 450000.0, 'EMI': 75567.0, 'DisbursalDate': '2016-10-31 00:00:00', 'MaturityDAte': '2020-08-05 00:00:00', 'AuthDate': '2

Exception in publishing message.
KafkaTimeoutError: Failed to update metadata after 60.0 secs.
25
++++++++++++++++++++++++++++++++++++


KeyboardInterrupt: 