In [None]:
%pip install minio

In [1]:
import os
import json
import csv
import time
from kafka import KafkaConsumer
from minio import Minio
from minio.error import S3Error

# Module-level state used by the inactivity check in save_to_minio (defined here, not elsewhere)
last_received_time = time.time()  # Reference timestamp (seconds since the epoch) compared against the clock on empty polls
inactivity_timeout = 10  # Set your inactivity timeout (e.g., 10 seconds)

In [2]:
# Header row written to the CSV by save_to_minio; the order matches the row values it builds.
column_names = ["Index", "Type", "Title", "Director", "Cast", "Country", "Release Date", 
                "Year", "Rating", "Duration", "Genre", "Description"]

In [None]:
# MinIO connection settings. The endpoint and credentials are read from the
# environment (MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY) so real
# secrets never have to be saved inside the notebook; the placeholder literals
# are used only as fallbacks when a variable is unset.
minio_client = Minio(
    os.environ.get('MINIO_ENDPOINT', 'localhost:9000'),  # MinIO's address
    access_key=os.environ.get('MINIO_ACCESS_KEY', 'access_key'),  # MinIO access key
    secret_key=os.environ.get('MINIO_SECRET_KEY', 'secret_key'),  # MinIO secret key
    secure=False  # Set to True if using HTTPS
)

bucket_name = 'my-kafka-bucket'  # Choose a valid name

In [4]:
# Create a Kafka consumer.
# No value_deserializer is configured here, so save_to_minio decodes message.value from UTF-8 bytes itself.
consumer = KafkaConsumer(
    'NewTopic',  # Replace with the actual topic name
    bootstrap_servers=['localhost:9092'],  # Same as the producer
    group_id='your_consumer_group',  # Consumer group ID (optional)
)

In [42]:
#for c in consumer:
#    print(c.value)

In [None]:
#csv_file_path = 'kafka_output.csv'

#with open(csv_file_path, 'w') as f:
#    for c in consumer:
#        f.write(str(c.value) + '\n')

#print(f"Data saved to {csv_file_path}")

In [None]:
# Path of the CSV file that save_to_minio writes and then uploads to MinIO.
# NOTE(review): 'filepath' looks like a placeholder — set a real file path before running.
kafka_server_path = 'filepath'

In [6]:
def create_bucket_if_not_exists():
    """Make sure the MinIO bucket named by `bucket_name` exists.

    Asks `minio_client` whether the bucket is present; creates it when it is
    not. Either way, a one-line status message is printed.
    """
    already_present = minio_client.bucket_exists(bucket_name)
    if already_present:
        print(f"Bucket {bucket_name} already exists.")
        return

    minio_client.make_bucket(bucket_name)
    print(f"Bucket {bucket_name} created.")

In [7]:
def _message_to_row(index, message_data):
    """Build one CSV row from a decoded message dict, ordered like `column_names`.

    The first cell is the running row number `index`; each remaining cell is
    looked up in `message_data` by column name, with '' for missing keys.
    """
    return [index] + [message_data.get(column, '') for column in column_names[1:]]


def save_to_minio():
    """Consume Kafka messages into a CSV file, then upload that file to MinIO.

    Steps:
      1. Ensure the target bucket exists.
      2. Write the `column_names` header to `kafka_server_path`, then poll
         `consumer` in a loop, appending one CSV row per JSON-object message.
      3. Stop once no message has arrived for more than `inactivity_timeout`
         seconds, then upload the file as 'kafka_output.csv'.

    Messages that cannot be decoded as UTF-8 JSON are reported and skipped;
    JSON values that are not objects are ignored.

    Side effects: overwrites the file at `kafka_server_path`, updates the
    module-level `last_received_time`, and prints progress messages.
    """
    global last_received_time  # Reassigned below to track inactivity

    # Ensure the bucket exists before uploading data
    create_bucket_if_not_exists()

    # Measure inactivity from the moment consumption starts, not from when the
    # module-level variable was first initialised; otherwise a stale timestamp
    # ends the loop on the very first empty poll.
    last_received_time = time.time()

    # Open the CSV file at the specified path (write mode)
    with open(kafka_server_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        # Write the header with column names
        writer.writerow(column_names)
        print("Header written to CSV.")

        index = 1  # Row number stored in the first CSV column

        while True:
            # Poll for messages, waiting up to 1000 ms (1 second)
            messages = consumer.poll(timeout_ms=1000)

            # Nothing arrived: stop if we have been idle too long, else poll again
            if not messages:
                print("No messages received.")
                if time.time() - last_received_time > inactivity_timeout:
                    print("Inactivity timeout reached, stopping consumer.")
                    break
                continue

            # Something arrived, so restart the inactivity clock
            last_received_time = time.time()

            # `messages` maps each partition to its list of records
            for messages_list in messages.values():
                for message in messages_list:
                    print(f"Processing message: {message.value}")

                    # Decode the byte string and parse it as JSON; one bad
                    # record must not abort the run and lose the upload.
                    try:
                        message_data = json.loads(message.value.decode('utf-8'))
                    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
                        print(f"Skipping malformed message: {exc}")
                        continue

                    if isinstance(message_data, dict):
                        row = _message_to_row(index, message_data)

                        # Write the row to the CSV file
                        writer.writerow(row)
                        print(f"Row written: {row}")
                        index += 1  # Increment the index for each row

        # Ensure data is written to disk before uploading
        csvfile.flush()
        os.fsync(csvfile.fileno())  # Force disk flush to ensure data is written

    # Upload the CSV file to MinIO
    file_size = os.path.getsize(kafka_server_path)
    print(f"File size before upload: {file_size} bytes")

    if file_size > 0:  # Only upload if the file has content
        with open(kafka_server_path, 'rb') as f:
            minio_client.put_object(
                bucket_name,
                'kafka_output.csv',  # Object name in MinIO
                f,  # File object
                file_size  # File size in bytes
            )
        print("Data uploaded to MinIO as kafka_output.csv")
    else:
        print("File is empty, skipping upload.")

# Start consuming: this call blocks until the inactivity check inside
# save_to_minio stops the polling loop, then uploads the resulting CSV to MinIO.
save_to_minio()

Bucket my-kafka-bucket already exists.
Header written to CSV.
Processing message: b'{"Index": 33, "Type": "TV Show", "Title": "Sex Education", "Director": NaN, "Cast": "Asa Butterfield, Gillian Anderson, Ncuti Gatwa, Emma Mackey, Connor Swindells, Kedar Williams-Stirling, Alistair Petrie", "Country": "United Kingdom", "Release Date": "2021-09-17", "Year": 2020.0, "Rating": "TV-MA", "Duration": "3 Seasons", "Genre": "British TV Shows, International TV Shows, TV Comedies", "Description": "Insecure Otis has all the answers when it comes to sex advice, thanks to his therapist mom. So rebel Maeve proposes a school sex-therapy clinic."}'
Row written: [1, 'TV Show', 'Sex Education', nan, 'Asa Butterfield, Gillian Anderson, Ncuti Gatwa, Emma Mackey, Connor Swindells, Kedar Williams-Stirling, Alistair Petrie', 'United Kingdom', '2021-09-17', 2020.0, 'TV-MA', '3 Seasons', 'British TV Shows, International TV Shows, TV Comedies', 'Insecure Otis has all the answers when it comes to sex advice, than

In [None]:
#for c in consumer:
#    print(c.value)