In [None]:
!pip install confluent_kafka

In [1]:
!pip show confluent_kafka

Name: confluent-kafka
Version: 2.8.0
Summary: Confluent's Python client for Apache Kafka
Home-page: https://github.com/confluentinc/confluent-kafka-python
Author: 
Author-email: "Confluent Inc." <support@confluent.io>
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 


In [None]:
import json
import os
import pandas as pd
from confluent_kafka import Producer

# Setup kafka producer config
# Kafka producer configuration.
# Connection details and SASL credentials can be supplied through environment
# variables so that secrets do not have to be committed with the notebook;
# when a variable is unset the original literal value is used unchanged.
conf = {
    'bootstrap.servers': os.environ.get('KAFKA_BOOTSTRAP_SERVERS', ''),
    'security.protocol': 'SASL_SSL',
    'sasl.mechanisms': 'PLAIN',
    'sasl.username': os.environ.get('KAFKA_SASL_USERNAME', 'Z'),
    'sasl.password': os.environ.get('KAFKA_SASL_PASSWORD', ''),
    'client.id': 'json-serial-producer',
}

# Single module-level producer instance shared by the streaming function below.
producer = Producer(conf)

# Name of the topic every record is published to.
topic = "raw_topic"

# Delivery report callback
def delivery_report(err, msg):
    """Print the outcome of a single produce request.

    Passed as the ``callback`` argument to ``producer.produce``.

    Args:
        err: Error reported for the message; a falsy value is treated as
            a successful delivery.
        msg: The message the report refers to. Only ``msg.key()`` is read,
            and only on the success path.
    """
    if not err:
        # Success path first: report the key of the delivered message.
        print(f"Message delivered successfully! Key: {msg.key()}")
        return
    print(f"Message delivery failed: {err}")

# Read checkpoint        
def read_checkpoint(checkpoint_file):
    """Return the line index stored in *checkpoint_file*.

    Args:
        checkpoint_file: Path of the text file holding a single integer.

    Returns:
        The stored integer, or 0 when the file does not exist or is empty
        (both are treated as "nothing has been sent yet").

    Raises:
        ValueError: If the file contains text that is not an integer.
    """
    # Open directly instead of checking for existence first: this avoids a
    # second filesystem lookup and the window between the check and the open.
    try:
        with open(checkpoint_file, 'r') as file:
            content = file.read().strip()
    except FileNotFoundError:
        return 0

    # An empty file (e.g. left behind by an interrupted write) would make
    # int('') raise; treat it the same as a missing checkpoint.
    if not content:
        return 0
    return int(content)

# Write checkpoint
def write_checkpoint(checkpoint_file, index):
    """Persist *index* as the new checkpoint and print a confirmation.

    The value is written to a sibling temporary file which is then renamed
    over the checkpoint. Opening the checkpoint itself with mode ``'w'``
    would truncate it first, so an interruption between the truncate and the
    write would leave an empty file and lose the recorded progress.

    Args:
        checkpoint_file: Path of the checkpoint file to create or replace.
        index: Value to store; written as ``str(index)``.
    """
    tmp_file = f"{checkpoint_file}.tmp"
    with open(tmp_file, 'w') as file:
        file.write(str(index))
    # os.replace swaps the file in a single step, so a reader sees either the
    # previous checkpoint or the new one, never a half-written file.
    os.replace(tmp_file, checkpoint_file)
    print(f"Checkpoint updated to line: {index}")

# Handle dates within our json records    
def handle_date(obj):
    """Serialize a pandas ``Timestamp`` for use as ``json.dumps(default=...)``.

    Args:
        obj: The object ``json.dumps`` could not serialize on its own.

    Returns:
        The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.

    Raises:
        TypeError: If *obj* is not a ``pd.Timestamp``.
    """
    # Guard clause: reject anything that is not a Timestamp up front.
    if not isinstance(obj, pd.Timestamp):
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
    return obj.strftime('%Y-%m-%d %H:%M:%S')
    
# Stream JSON serially
def stream_json_serially(file_path, checkpoint_file='/kaggle/working/checkpoint.txt'):
    """Publish each JSON line of *file_path* to Kafka, one message at a time.

    Lines whose index is below the stored checkpoint are skipped, so a
    re-run resumes where the previous run stopped. Each remaining line is
    decoded, produced to the module-level ``topic`` keyed by its
    ``review_id``, and flushed before the checkpoint is advanced.

    The checkpoint only advances after the delivery callback reports
    success. If a delivery fails, streaming stops without touching the
    checkpoint so that the failed record is retried on the next run rather
    than silently skipped.

    Args:
        file_path: Path to a file containing one JSON document per line.
        checkpoint_file: Path of the file that stores the index of the next
            line to send.

    Raises:
        KeyError: If a decoded record has no ``review_id`` key.
    """
    last_sent_index = read_checkpoint(checkpoint_file)

    # Error reported by the delivery callback for the message in flight;
    # reset before every produce call.
    delivery_state = {'err': None}

    def _track_delivery(err, msg):
        # Record the outcome for the loop below, then keep the usual report.
        delivery_state['err'] = err
        delivery_report(err, msg)

    # Decode the input explicitly as UTF-8 rather than relying on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):
            if idx < last_sent_index:  # Already sent in a previous run
                continue

            # Keep the try body to the one statement that can raise the
            # handled exception; undecodable lines are reported and skipped.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Failed to decode JSON: {e}")
                continue

            delivery_state['err'] = None
            producer.produce(
                topic,
                key=str(record['review_id']),
                # handle_date formats any pandas Timestamp values in the record
                value=json.dumps(record, default=handle_date).encode('utf-8'),
                callback=_track_delivery
            )

            # Wait for the message to be sent to Kafka; the delivery callback
            # is served during this call.
            producer.flush()

            if delivery_state['err'] is not None:
                # Do not advance the checkpoint past an undelivered record.
                print(f"Stopping at line {idx}: delivery failed, checkpoint not advanced")
                return

            write_checkpoint(checkpoint_file, idx + 1)  # Next line to send
                
# Script entry point: stream the Yelp review dataset using the default
# checkpoint location.
if __name__ == "__main__":
    stream_json_serially(
        file_path='/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json',
    )