In [2]:
!pip install apache-flink

from pyflink.datastream.connectors.file_system import FileSource, StreamFormat
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import MapFunction
from pyflink.common.watermark_strategy import WatermarkStrategy
from pyflink.datastream import RuntimeExecutionMode
import json

class JsonObjectMapFunction(MapFunction):
    """Map function that decodes a JSON-encoded record into a Python object."""

    def map(self, value):
        """Return the result of parsing ``value`` with ``json.loads``."""
        decoded = json.loads(value)
        return decoded

def read_json_as_datastream(file_path: str, env: StreamExecutionEnvironment):
    """Read ``file_path`` line by line and return a stream of parsed JSON values.

    Args:
        file_path: Location handed to the text-line ``FileSource``.
        env: Execution environment the source is attached to.

    Returns:
        The stream obtained by mapping every text line through
        ``JsonObjectMapFunction`` (i.e. ``json.loads`` applied per line).
    """
    line_source = FileSource.for_record_stream_format(
        StreamFormat.text_line_format(),
        file_path,
    ).build()
    # No watermarks are generated for this source; it is registered as "txt_source".
    return env.from_source(
        line_source,
        WatermarkStrategy.no_watermarks(),
        "txt_source",
    ).map(JsonObjectMapFunction())



In [3]:
# Locations of the input .jsonl data files (listed alphabetically);
# edit these values to match where the files live on your machine.
appointments_path = "appointments.jsonl"
groomers_path = "groomers.jsonl"
petowners_path = "petowners.jsonl"
pets_path = "pets.jsonl"
products_path = "products.jsonl"
services_path = "services.jsonl"
users_path = "users.jsonl"

In [None]:
## example for PyFlink DataStream API
# Obtain the execution environment on which the pipeline below is defined
env = StreamExecutionEnvironment.get_execution_environment()

# Create the DataStream for the groomers file (groomers_path), one parsed JSON value per line
groomers_stream = read_json_as_datastream(groomers_path, env)

# Print each groomer object and its type
# (print() runs as a side effect inside the map; the lambda itself returns None)
groomers_stream.map(lambda x: print(x, type(x)))

# Run the pipeline defined above as a job named "sample_stream"
env.execute("sample_stream")


In [None]:
## example for PyFlink Table API
from pyflink.table import EnvironmentSettings, TableEnvironment

# Step 1: Initialize the TableEnvironment in batch mode
env_settings = EnvironmentSettings.in_batch_mode()
table_env = TableEnvironment.create(env_settings)

# Step 2: Register the pets file as a table.
# The location is taken from `pets_path` (defined with the other data paths)
# instead of a hard-coded literal, so changing the path there is honoured here.
# Single quotes are doubled so the value stays a valid SQL string literal.
pets_sql_path = pets_path.replace("'", "''")

table_env.execute_sql(f"""
    CREATE TABLE pets (
        pet_id STRING,
        name STRING,
        species STRING,
        breed STRING,
        dob STRING,
        owner_id STRING
    ) WITH (
        'connector' = 'filesystem',
        'path' = '{pets_sql_path}',
        'format' = 'json'
    )
""")

# Step 3: Query the table
result_table = table_env.sql_query("""
    SELECT
        *
    FROM pets
""")

# Step 4: Execute the query and print every row; the `with` block closes the
# result iterator once iteration is finished.
with result_table.execute().collect() as results:
    for result in results:
        print(result)

In [None]:
# Q1.a
# Obtain the execution environment on which this question's pipeline is defined
env = StreamExecutionEnvironment.get_execution_environment()
# Your code here
# Run the pipeline defined above as a job named "Q1.a"
env.execute("Q1.a")


In [None]:
# Q1.b

In [None]:
# Q1.c


In [None]:
# Q2.a DataStream API
# NOTE(review): another "Q2.a DataStream API" cell appears further down in this
# notebook; keep only one of them.
# The environment is created here so the cell does not depend on an `env`
# left over from a previously executed cell.
env = StreamExecutionEnvironment.get_execution_environment()

# Your code here

env.execute("Q2.a DataStream API")


In [None]:
# Q1.b
# Your Answer:

In [None]:
# Q1.c
# Your Answer:

In [None]:
# Q2.a DataStream API
# Obtain the execution environment on which this question's pipeline is defined
env = StreamExecutionEnvironment.get_execution_environment()

# Your code here

# Run the pipeline defined above as a job named "Q2.a DataStream API"
env.execute("Q2.a DataStream API")


In [None]:
# Q2.a Table API

# Your code here

In [None]:
# Q2.b DataStream API
# Obtain the execution environment on which this question's pipeline is defined
env = StreamExecutionEnvironment.get_execution_environment()

# Your code here

# Run the pipeline defined above as a job named "Q2.b DataStream API"
env.execute("Q2.b DataStream API")


In [None]:
# Q2.b Table API

# Your code here

In [None]:
# Q2.c DataStream API
# Obtain the execution environment on which this question's pipeline is defined
env = StreamExecutionEnvironment.get_execution_environment()

# Your code here

# Run the pipeline defined above as a job named "Q2.c DataStream API"
env.execute("Q2.c DataStream API")


In [None]:
# Q2.c Table API

# Your code here

In [None]:
# Q2.d
# Obtain the execution environment on which this question's pipeline is defined
env = StreamExecutionEnvironment.get_execution_environment()

# Your code here

# Run the pipeline defined above as a job named "Q2.d"
env.execute("Q2.d")


In [None]:
# Q2.e
# Obtain the execution environment on which this question's pipeline is defined
env = StreamExecutionEnvironment.get_execution_environment()

# Your code here

# Run the pipeline defined above as a job named "Q2.e"
env.execute("Q2.e")
