In [1]:
%pip install confluent_kafka sseclient geopandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
from confluent_kafka import SerializingProducer, DeserializingConsumer
from confluent_kafka.serialization import StringSerializer, StringDeserializer
from confluent_kafka.admin import AdminClient, NewTopic
from uuid import uuid4
import os
import sys, random

# Kafka connection details come from environment variables; each name is bound
# to None when its variable is not set.
brokers, protocol = (
    os.environ.get(variable)
    for variable in (
        'KAFKA_ADVERTISED_LISTENERS',
        'KAFKA_LISTENER_SECURITY_PROTOCOL_MAP',
    )
)

In [3]:
# Settings handed to SerializingProducer when the producer is created.
pconf = {
    # Where and how to connect.
    'bootstrap.servers': brokers,
    'security.protocol': 'PLAINTEXT',
    # Partitioner name passed through to the client library.
    'partitioner': 'murmur2_random',
    # Message keys and values are both encoded as UTF-8 strings.
    'value.serializer': StringSerializer('utf_8'),
    'key.serializer': StringSerializer('utf_8'),
}

In [4]:
# Instantiate the Kafka producer with the settings collected in pconf.
# `p` is the handle the send loop later calls produce(), poll() and flush() on.
p = SerializingProducer(pconf)

In [5]:
# Path of the CSV file that the send loop opens and replays row by row
# (relative paths are resolved against the current working directory).
# NOTE(review): despite the name, construct_stock reads these rows as taxi
# trips (hack_license, pick-up/drop-off fields) — confirm and consider renaming.
stocks = "sample.csv"

In [6]:
from datetime import datetime
import geopandas as gpd
import pandas as pd
from shapely.geometry import shape, Point

def _repair(geom):
    """Return geom unchanged when it is valid, otherwise its zero-width buffer."""
    return geom if geom.is_valid else geom.buffer(0)


# Read the borough features from the GeoJSON file into a GeoDataFrame.
gdf = gpd.read_file('./nyc-boroughs.geojson')

# Swap every invalid geometry for its buffer(0) result; valid ones are kept as-is.
gdf['geometry'] = gdf['geometry'].apply(_repair)

# One (boroughCode, geometry) tuple per feature, in file order.
boroughs = [
    (feature['boroughCode'], shape(feature['geometry']))
    for _, feature in gdf.iterrows()
]

def get_borough(longitude, latitude):
    """Look up the borough whose geometry contains the given coordinates.

    Args:
        longitude: x coordinate handed to ``Point``.
        latitude: y coordinate handed to ``Point``.

    Returns:
        The code of the first entry in ``boroughs`` whose geometry contains
        the point, or ``None`` when no geometry contains it.
    """
    location = Point(longitude, latitude)
    matching_codes = (
        borough_code
        for borough_code, outline in boroughs
        if outline.contains(location)
    )
    return next(matching_codes, None)


def construct_stock(row):
    """Build the message payload for one row of the input CSV.

    Args:
        row: Sequence of column values. Index 1 is used as the hack license,
            5 and 6 as the pick-up and drop-off times, 10/11 as the pick-up
            longitude/latitude and 12/13 as the drop-off longitude/latitude.

    Returns:
        dict with the keys ``hack_license``, ``pick_up_location``,
        ``drop_off_location``, ``pick_up_time``, ``drop_off_time`` and
        ``timestamp``. The two locations are whatever ``get_borough`` returns
        for the coordinates (``None`` when no borough matches); ``timestamp``
        is the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    # The trailing "Z" designates UTC, so the clock has to be read in UTC.
    # datetime.fromtimestamp() without a tz returns *local* time, which made
    # the emitted value wrong on any host whose timezone is not UTC.
    str_date_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    stock = {
        # Some unique ID for the car (license)
        "hack_license": row[1],
        # Pick-up location
        "pick_up_location": get_borough(row[10], row[11]),
        # Drop-off location
        "drop_off_location": get_borough(row[12], row[13]),
        # Pick-up time
        "pick_up_time": row[5],
        # Drop-off time
        "drop_off_time": row[6],
        # Time at which this message was built
        "timestamp": str_date_time,
    }
    return stock

In [7]:
import csv, json
import time
from itertools import islice

START_ROW = 70572      # rows skipped (after the first line) before replay begins
MAX_MESSAGES = None    # cap on rows sent per run; None replays to the end of the file
SEND_INTERVAL_S = 0.5  # pause between consecutive messages, in seconds


def _report_delivery(err, msg):
    """Delivery callback: report messages that could not be delivered."""
    if err is not None:
        sys.stderr.write('%% Delivery failed for topic %s: %s\n' % (msg.topic(), err))


n = 0  # number of messages handed to the producer so far

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to csv.reader.
with open(stocks, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # drop the first line (presumably the header row)
    stop_row = None if MAX_MESSAGES is None else START_ROW + MAX_MESSAGES
    csv_reader = islice(csv_reader, START_ROW, stop_row)

    try:
        for row in csv_reader:
            stock = construct_stock(row)
            print(stock)
            p.produce('stock', value=json.dumps(stock), on_delivery=_report_delivery)
            # Serve pending delivery callbacks without blocking the loop.
            p.poll(0)
            n += 1
            time.sleep(SEND_INTERVAL_S)
    except BufferError:
        sys.stderr.write('%% Local producer queue is full (%d messages awaiting delivery): try again\n' % len(p))
    finally:
        # Block once, on every exit path (normal end, error or kernel
        # interrupt), until everything still queued has been delivered,
        # instead of stalling on flush() after each individual message.
        p.flush()

{'hack_license': 'CE2E9A5E5D50EFDA8D537F599455450E', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-13 15:58:00', 'drop_off_time': '2013-01-13 16:03:00', 'timestamp': '2024-06-06T06:53:09Z'}
{'hack_license': 'D3F049757A7934973F22A4E15A909581', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-13 16:03:00', 'drop_off_time': '2013-01-13 16:05:00', 'timestamp': '2024-06-06T06:53:10Z'}
{'hack_license': '23CE32E836BB0C56445E7674F27D0B60', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-13 16:30:00', 'drop_off_time': '2013-01-13 16:39:00', 'timestamp': '2024-06-06T06:53:10Z'}
{'hack_license': 'CFBFA444D13FFEF8A8D50641EB302873', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-13 15:51:00', 'drop_off_time': '2013-01-13 15:55:00', 'timestamp': '2024-06-06T06:53:11Z'}
{'hack_license': 'BEB9BC15F50159818594E541127EB304', 'pick_up_location': 1, 'drop_off_location': 1, 'pick_up_time': '2013-01-13 15:50:00

### Use only if you need to purge all messages: this deletes the `stock` topic

In [None]:
brokers = "broker:9092"

admin_client = AdminClient({"bootstrap.servers": brokers})

# delete_topics() only requests the deletion and returns one future per topic.
# Wait on each future so that a failed deletion is reported instead of being
# silently ignored.
for topic, future in admin_client.delete_topics(topics=['stock']).items():
    try:
        future.result()
        print("Topic %r deleted" % topic)
    except Exception as exc:
        print("Failed to delete topic %r: %s" % (topic, exc))