In [66]:
import names
import time
import random
import pgeocode
import csv
import requests
import json
import pgeocode

## Generate Pinot Table (Prerequisite: Schema has already been created)

In [85]:
# Relative path of the JSON file holding the Pinot realtime table config;
# it is read and posted to the Pinot controller by the next cell.
v_path_to_table_config = "trips/trips_realtime_table_config.json"

In [86]:
# Load the table config inside a `with` block so the file handle is closed
# deterministically (the previous `json.load(open(...))` never closed it).
with open(v_path_to_table_config) as table_config_file:
    table_config = json.load(table_config_file)

# Register the table with the Pinot controller and show the HTTP status and
# the controller's response body.
response = requests.post('http://pinot-controller.pinot:9000/tables', json=table_config)
print(response)
print(response.text)

<Response [200]>
{"status":"Table trips_REALTIME succesfully added"}


## Kafka Create Topic 

In [105]:
from kafka.admin import KafkaAdminClient, NewTopic

# Admin connection to the Kafka broker; the following cells use it for topic management.
admin_client = KafkaAdminClient(client_id='test', bootstrap_servers="pinot-kafka.pinot:9092")

In [106]:
# delete old Kafka Topic
# On a fresh broker the topic does not exist yet and kafka-python raises
# UnknownTopicOrPartitionError; that case is treated as "nothing to delete"
# so the notebook still runs top to bottom after a kernel restart.
from kafka.errors import UnknownTopicOrPartitionError

try:
    delete_topics_response = admin_client.delete_topics(topics=["trips"])
except UnknownTopicOrPartitionError:
    delete_topics_response = None
delete_topics_response  # last expression: displayed as the cell output when a topic was deleted

DeleteTopicsResponse_v3(throttle_time_ms=0, topic_error_codes=[(topic='trips', error_code=0)])

In [107]:
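# create new topic with settings (currently disabled; presumably the topic is auto-created by the broker on first produce - TODO confirm)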
# admin_client.create_topics(new_topics=[NewTopic(name="trips", num_partitions=1, replication_factor=1)], validate_only=False)

## Kafka Producer - Generate Data

In [108]:
# Choose random city of file containing German cities with postcode
# The file is read inside a `with` block so the handle is closed even if
# parsing raises. `newline=''` is the mode the csv module asks for, and the
# encoding is pinned so place names with umlauts do not depend on the locale
# (assumes the file is UTF-8, as written by pgeocode - TODO confirm).
with open('./pgeocode_data/DE.txt', newline='', encoding='utf-8') as geocode_file:
    geocode_list = list(csv.reader(geocode_file))[1:] # skip first line (header)
random.shuffle(geocode_list)
geocode_list = geocode_list[:1000] # take only random 1000 places to generate more overlapping data

def choose_random_city():
    """Return one randomly picked row from the module-level ``geocode_list``."""
    picked_row = random.choice(geocode_list)
    return picked_row

# Pre-generate a fixed pool of 1000 full names; drawing drivers and riders
# from a limited pool produces more overlapping data.
names_list = [names.get_full_name() for _ in range(1000)]

def choose_random_name():
    """Return one randomly picked entry from the module-level ``names_list``."""
    picked_name = random.choice(names_list)
    return picked_name
    
# Generation of License Plate
# character pools the plate characters are drawn from
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
numbers = '0123456789'

def generate_license_plate():
    """Return a random plate string of the form ``LL-LL-NN``.

    ``L`` is a character drawn from ``letters`` and ``N`` a character drawn
    from ``numbers``. The four letters are drawn first, then the two digits.
    """
    plate_letters = [random.choice(letters) for _ in range(4)]
    plate_digits = [random.choice(numbers) for _ in range(2)]
    return '{}{}-{}{}-{}{}'.format(*plate_letters, *plate_digits)

# Calculation of price based on distance between start city and end destination
def calculate_price(v_distance):
    """Return a randomised price for a trip of length ``v_distance``.

    A rate is drawn uniformly between 0.8 and 2.0 and rounded to two
    decimals; the price is the distance times that rate, rounded to two
    decimals as well.
    """
    rate = round(random.uniform(0.8, 2.0), 2)
    return round(v_distance * rate, 2)

In [109]:
# pgeocode distance helper created for country code 'de'.
# NOTE(review): no active code visible in this notebook uses `dist` - confirm whether it is still needed.
dist = pgeocode.GeoDistance('de')

In [110]:
from kafka import KafkaProducer


def _encode_record_as_json(value):
    """Serialize ``value`` to JSON text and encode it as UTF-8 bytes."""
    return json.dumps(value).encode('utf-8')


# Producer for the Kafka broker; every value handed to send() is passed
# through the JSON serializer above.
producer = KafkaProducer(
    bootstrap_servers=['pinot-kafka.pinot:9092'],
    value_serializer=_encode_record_as_json,
)

In [111]:
# begin generating trips data at current time
start_timestamp_ms = time.time_ns() // 1_000_000

# Generate data: one synthetic trip record per loop iteration, sent to the 'trips' topic.
num_records = 100000 + random.randint(5000,10000)
for i in range(num_records):
    v_start_location = choose_random_city()
    v_end_location = choose_random_city()
    # The distance is a random integer, not a geographic distance.
    # TODO: derive it from the start/end postal codes (e.g. dist.query_postal_code)
    # if realistic distances are needed.
    v_distance = random.randint(5, 1000)

    # Requests are spaced 1000 ms apart with up to 100 ms of random jitter.
    # The jitter is smaller than the spacing, so request times still increase
    # with i; it only makes the gaps between requests irregular.
    v_requesttime = start_timestamp_ms + i*1000 + random.randint(0, 100)

    v_waiting_time_millis = random.randint(1, 3600000)
    # distance divided by a random speed (45-60) is treated as hours and
    # converted to milliseconds.
    v_trip_time = round((v_distance / random.randint(45, 60)) * 60 * 60 * 1000)

    # Row columns 1, 2 and 3 are used as zip code, place name and state.
    record = {
        "rider_name": choose_random_name(),
        "driver_name": choose_random_name(),
        "license_plate": generate_license_plate(),
        "start_location": v_start_location[2],
        "start_zip_code": v_start_location[1],
        "start_location_state": v_start_location[3],
        "end_location": v_end_location[2],
        "end_zip_code": v_end_location[1],
        "end_location_state": v_end_location[3],
        "rider_is_premium": random.randint(0, 1),
        "count": 1,
        "payment_amount": calculate_price(v_distance),
        "payment_tip_amount": random.randint(5, 50),
        "trip_wait_time_millis": v_waiting_time_millis,
        "rider_rating": random.randint(0, 5),
        "driver_rating": random.randint(0, 5),
        "trip_start_time_millis": v_requesttime + v_waiting_time_millis,
        "request_time_millis": v_requesttime,
        "trip_end_time_millis": v_requesttime + v_waiting_time_millis + v_trip_time
    }

    producer.send('trips', value=record)

    if i % 5000 == 0:
        print(f'{i} records generated')

# send() only queues records for asynchronous delivery; flush() blocks until
# the buffered records have been sent, so the final message below is printed
# only after delivery has been attempted for every record.
producer.flush()
print(f'{num_records} records generated')

0 records generated
5000 records generated
10000 records generated
15000 records generated
20000 records generated
25000 records generated
30000 records generated
35000 records generated
40000 records generated
45000 records generated
50000 records generated
55000 records generated
60000 records generated
65000 records generated
70000 records generated
75000 records generated
80000 records generated
85000 records generated
90000 records generated
95000 records generated
100000 records generated
105000 records generated
106318 records generated
