# 1. Producing the data
## Implement an Apache Kafka producer that simulates real-time data transfer from one repository to another.


In [1]:
#importing libraries and statements
import json, time
import itertools
import pandas as pd
from kafka3 import KafkaProducer
from datetime import datetime

- When reading the CSV with pandas, pass `dtype=str` (or convert the columns back to strings before sending); otherwise pandas will try to infer a type for each column.

In [2]:
#loading produce_data and keeping only the rows of year 2011
# dtype=str keeps every column as text, so pandas does not infer a type per column;
# "Date" is the only column parsed, because it drives the filter and the sort below.
# The whole preparation is a single chain that returns a new frame at each step,
# so re-running this cell always rebuilds producer_df from the file.
producer_df = (
    pd.read_csv("produce_data.csv", dtype=str)
    # parse the text dates so the year can be inspected
    .assign(Date=lambda frame: frame["Date"].astype("datetime64[ns]"))
    # keep the 2011 rows only
    .loc[lambda frame: frame["Date"].dt.year == 2011]
    # replace the timestamps by plain calendar dates
    .assign(Date=lambda frame: frame["Date"].dt.date)
    # order by date, then by store id; "Store" is still text here, so the stores
    # are ordered as strings ("1", "10", "11", ...) rather than as numbers
    .sort_values(by=["Date", "Store"], ascending=True)
    # renumber the rows 0..n-1 after filtering and sorting
    .reset_index(drop=True)
)
producer_df.head(2)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,last_weekly_sales
0,1,2011-01-07,48.27,2.976,,,,,,211.40474,7.742,False,1367320.0062122345
1,10,2011-01-07,43.43,3.287,,,,,,127.19177,8.744,False,1707298.1340065002


In [3]:
#configuration
# IP address of the machine running the Kafka broker - adjust to your environment
hostip = "192.168.62.158" #change me


def _to_utf8_json(message):
    """Serialize a message value to JSON text and encode it as UTF-8 bytes."""
    return json.dumps(message).encode('utf-8')


#producer connected to the broker on port 9092; message values are turned into
#UTF-8 encoded JSON by the serializer above
producer = KafkaProducer(
    bootstrap_servers=[hostip + ":9092"],
    value_serializer=_to_utf8_json,
    api_version=(0, 10),
)


In [None]:
#publishing the rows of one date at a time, cycling through the dates continuously

#topic the batches are published to, and the pause (in seconds) between two batches
TOPIC_NAME = "kafka_producer_stream"
PUBLISH_INTERVAL_SECONDS = 5

#building the batch of every unique date ONCE, before the endless loop: the rows
#of a date never change, so filtering, casting and converting them again on every
#cycle would be repeated work. Each batch is a list of dictionaries (one per row)
#whose values are all converted into string format (missing values become 'nan').
date_batches = [
    producer_df[producer_df["Date"] == batch_date].astype(str).to_dict(orient = "records")
    for batch_date in producer_df["Date"].unique()
]

#iterating through the per-date batches continuously (stop by interrupting the kernel)
for date_rows in itertools.cycle(date_batches):

    #"ts" is the current time stamp in whole seconds, taken at sending time
    # ts holds integer format while the others are of string type
    sent_at = int(datetime.timestamp(datetime.now()))

    #fresh dictionaries for every send, so the prepared batches are never modified
    payload = [{**row, "ts": sent_at} for row in date_rows]

    print("##############################################################################################\n" + str(payload))

    #sending the whole batch as one message and waiting until it has been delivered
    producer.send(TOPIC_NAME, payload)
    producer.flush()

    #waiting before the next list of data is being published
    time.sleep(PUBLISH_INTERVAL_SECONDS)

##############################################################################################
[{'Store': '1', 'Date': '2011-01-07', 'Temperature': '48.27', 'Fuel_Price': '2.976', 'MarkDown1': 'nan', 'MarkDown2': 'nan', 'MarkDown3': 'nan', 'MarkDown4': 'nan', 'MarkDown5': 'nan', 'CPI': '211.40474', 'Unemployment': '7.742', 'IsHoliday': 'false', 'last_weekly_sales': '1367320.0062122345', 'ts': 1675857793}, {'Store': '10', 'Date': '2011-01-07', 'Temperature': '43.43', 'Fuel_Price': '3.287', 'MarkDown1': 'nan', 'MarkDown2': 'nan', 'MarkDown3': 'nan', 'MarkDown4': 'nan', 'MarkDown5': 'nan', 'CPI': '127.19177', 'Unemployment': '8.744', 'IsHoliday': 'false', 'last_weekly_sales': '1707298.1340065002', 'ts': 1675857793}, {'Store': '11', 'Date': '2011-01-07', 'Temperature': '54.43', 'Fuel_Price': '2.976', 'MarkDown1': 'nan', 'MarkDown2': 'nan', 'MarkDown3': 'nan', 'MarkDown4': 'nan', 'MarkDown5': 'nan', 'CPI': '214.69551', 'Unemployment': '7.551', 'IsHoliday': 'false', 'last_weekly_sales': '117

## References: 
- Tutorials for weeks 9, 10 and 11

- afsar (no date) How to convert spark streaming nested JSON coming on Kafka to flat dataframe?, Stack Overflow. Available at: https://stackoverflow.com/questions/46204750/how-to-convert-spark-streaming-nested-json-coming-on-kafka-to-flat-dataframe (Accessed: February 8, 2023). 

- Kukreja, M. (2020) Track real-time gold prices using Apache Kafka, Pandas & matplotlib, Medium. Towards Data Science. Available at: https://towardsdatascience.com/track-real-time-gold-prices-using-apache-kafka-pandas-matplotlib-122a73728a88 (Accessed: February 8, 2023). 

- Structured Streaming + kafka integration guide (kafka broker version 0.10.0 or higher) (no date) Structured Streaming + Kafka Integration Guide (Kafka broker version 0.10.0 or higher) - Spark 3.3.1 Documentation. Available at: https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html (Accessed: February 8, 2023). 

- Structured Streaming Programming Guide (no date) Structured Streaming Programming Guide - Spark 3.3.1 Documentation. Available at: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html (Accessed: February 8, 2023). 