In [1]:
# Install the third-party packages used by this notebook via the %pip magic.
# NOTE(review): sseclient is not imported in any cell shown here — confirm it is still needed.
%pip install confluent_kafka sseclient geopandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
from confluent_kafka import SerializingProducer, DeserializingConsumer
from confluent_kafka.serialization import StringSerializer, StringDeserializer
from confluent_kafka.admin import AdminClient, NewTopic
from uuid import uuid4
import os
import sys, random

# Kafka connection details are read from the process environment; each value
# is None when the corresponding variable is not set.
# NOTE(review): `protocol` is not referenced by the other cells shown here.
brokers = os.environ.get('KAFKA_ADVERTISED_LISTENERS')
protocol = os.environ.get('KAFKA_LISTENER_SECURITY_PROTOCOL_MAP')

In [4]:
# Producer configuration, assembled from two groups of settings.
# Broker connection and partitioning options.
connection_settings = {
    'bootstrap.servers': brokers,
    'partitioner': 'murmur2_random',
    'security.protocol': 'PLAINTEXT',
}
# Message keys and values are both serialized as UTF-8 strings.
serialization_settings = {
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': StringSerializer('utf_8'),
}
pconf = {**connection_settings, **serialization_settings}

In [6]:
# Producer instance built from pconf; the streaming cell calls produce/poll/flush on it.
p = SerializingProducer(pconf)

In [5]:
# Path of the CSV file whose rows are read and published by the streaming cell.
stocks = "sample.csv"

In [7]:
from datetime import datetime
import geopandas as gpd
import pandas as pd
from shapely.geometry import shape, Point

# Read the borough polygons from the GeoJSON file into a GeoDataFrame.
gdf = gpd.read_file('./nyc-boroughs.geojson')


def _ensure_valid(geom):
    """Return ``geom`` unchanged when it is valid, otherwise ``geom.buffer(0)``."""
    if geom.is_valid:
        return geom
    return geom.buffer(0)


def _borough_entry(row):
    """Return the (boroughCode, geometry) pair for one GeoDataFrame row."""
    return (row['boroughCode'], shape(row['geometry']))


# Replace any invalid geometry before it is used for containment tests.
gdf['geometry'] = gdf['geometry'].apply(_ensure_valid)

# List of (boroughCode, geometry) tuples, one per row, scanned by get_borough.
boroughs = gdf.apply(_borough_entry, axis=1).tolist()

# Define the function to get borough code based on coordinates
def get_borough(longitude, latitude):
    """Return the code of the first borough whose geometry contains the point.

    Args:
        longitude: X coordinate, as a number or a numeric string (CSV fields
            arrive as strings).
        latitude: Y coordinate, as a number or a numeric string.

    Returns:
        The borough code paired with the first geometry in ``boroughs`` that
        contains the point, or None when no geometry contains it or when the
        coordinates are missing or not numeric.
    """
    try:
        # Convert explicitly so the result does not depend on how Point
        # treats string arguments.
        point = Point(float(longitude), float(latitude))
    except (TypeError, ValueError):
        # An empty or malformed coordinate field cannot be located; report it
        # as "no borough" instead of aborting the caller's whole stream.
        return None
    for code, geom in boroughs:
        if geom.contains(point):
            return code
    return None


def construct_stock(row):
    """Build the message payload for one CSV row.

    Args:
        row: Sequence of column values for one record. Columns 1, 5 and 6 are
            copied through as the licence, pick-up time and drop-off time;
            columns 10/11 and 12/13 are passed to ``get_borough`` as the
            (longitude, latitude) of the pick-up and drop-off points.

    Returns:
        dict with the keys ``hack_license``, ``pick_up_location``,
        ``drop_off_location``, ``pick_up_time``, ``drop_off_time`` and
        ``timestamp`` (the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``).
    """
    # Local import keeps the function independent of the order in which the
    # notebook's import cells were executed.
    from datetime import datetime, timezone

    # The trailing "Z" designates UTC, so the clock is read in UTC rather
    # than in the machine's local time zone.
    str_date_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    stock = {
        # Some unique ID for the car (license)
        "hack_license": row[1],
        # Pick-up location
        "pick_up_location": get_borough(row[10], row[11]),
        # Drop-off location
        "drop_off_location": get_borough(row[12], row[13]),
        # Pick-up time
        "pick_up_time": row[5],
        # Drop-off time
        "drop_off_time": row[6],
        # Time at which this message was built
        "timestamp": str_date_time,
    }
    return stock

In [8]:
import csv, json
import time
def _report_delivery(err, msg):
    """Delivery callback: write an error line for any message that failed."""
    if err is not None:
        sys.stderr.write('%% Delivery failed for message on %s: %s\n' % (msg.topic(), err))


n = 0  # number of messages handed to the producer so far
# newline='' lets the csv module handle line endings inside quoted fields itself.
with open(stocks, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header row
    try:
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to send.
                continue
            stock = construct_stock(row)
            print(stock)
            payload = json.dumps(stock)
            try:
                p.produce('stock', value=payload, on_delivery=_report_delivery)
            except BufferError:
                sys.stderr.write('%% Local producer queue is full (%d messages awaiting delivery): try again\n' % len(p))
                # Serve delivery callbacks to free queue space, then retry once
                # so a full queue delays the stream instead of ending it.
                p.poll(1)
                p.produce('stock', value=payload, on_delivery=_report_delivery)
            n += 1
            # Serve delivery callbacks for messages sent so far.
            p.poll(0)
            # Throttle the stream to roughly two messages per second.
            time.sleep(0.5)
    finally:
        # Wait for outstanding messages once, even if the loop was interrupted.
        p.flush()

    
        
    
    
    

{'hack_license': 'BA96DE419E711691B9445D6A6307C170', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-01 15:11:48', 'drop_off_time': '2013-01-01 15:18:10', 'timestamp': '2024-05-31T12:03:01Z'}
{'hack_license': '9FD8F69F0804BDB5549F40E9DA1BE472', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-06 00:18:35', 'drop_off_time': '2013-01-06 00:22:54', 'timestamp': '2024-05-31T12:03:02Z'}
{'hack_license': '9FD8F69F0804BDB5549F40E9DA1BE472', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-05 18:49:41', 'drop_off_time': '2013-01-05 18:54:23', 'timestamp': '2024-05-31T12:03:03Z'}
{'hack_license': '51EE87E3205C985EF8431D850C786310', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-07 23:54:15', 'drop_off_time': '2013-01-07 23:58:20', 'timestamp': '2024-05-31T12:03:03Z'}
{'hack_license': '51EE87E3205C985EF8431D850C786310', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-07 23:25:03

### Use only if you need to purge all messages (this deletes the `stock` topic)

In [None]:
# Broker address used for the admin connection.
brokers = "broker:9092"

admin_client = AdminClient({"bootstrap.servers": brokers})
# delete_topics() is asynchronous: it returns a dict mapping each topic name to
# a future, so wait on every future to learn whether the deletion succeeded.
deletion_futures = admin_client.delete_topics(topics=['stock'])
for topic_name, future in deletion_futures.items():
    try:
        future.result()  # raises if the deletion failed
        print("Topic '%s' deleted" % topic_name)
    except Exception as exc:
        # Report the failure instead of letting it pass unnoticed.
        print("Failed to delete topic '%s': %s" % (topic_name, exc))