In [1]:
import os
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

import get_env
# Obtain the stream execution environment from the local `get_env` helper.
# NOTE(review): "remote" presumably selects a remote Flink cluster — confirm
# against get_env.get_env, which is not visible in this notebook.
env = get_env.get_env("remote")

# Wrap the stream environment in a Table API environment; later cells issue
# their SQL through `t_env`.
t_env = StreamTableEnvironment.create(env)

# Set the default parallelism option to "1". The call is left as the cell's
# final expression, so the notebook shows the returned TableConfig.
table_config = t_env.get_config()
table_config.set("parallelism.default", "1")

2025-08-31T13:46:09.868589Z main ERROR Reconfiguration failed: No configuration found for '3a4afd8d' at 'null' in 'null'
2025-08-31T13:46:10.986270Z Thread-3 ERROR Reconfiguration failed: No configuration found for '25a4846d' at 'null' in 'null'


<pyflink.table.table_config.TableConfig at 0x7fa3cebf7160>

In [2]:
 

# Drop any earlier definitions of the two tables this cell creates, so the
# CREATE statements below do not collide with leftovers from a previous run.
for drop_statement in (
    "DROP TABLE IF EXISTS sentences",
    "DROP TABLE IF EXISTS word_count_print",
):
    t_env.execute_sql(drop_statement)

# Kafka-backed source table: a single STRING column `msg` read from topic
# 'sentence' with the 'raw' value format, starting at the earliest offset.
SENTENCES_SOURCE_DDL = """
CREATE TABLE sentences (
  msg STRING
) WITH (
  'connector' = 'kafka',
  'topic' = 'sentence',
  'properties.bootstrap.servers' = 'broker:9092',
  'properties.group.id' = 'flink-sentences-5',
  'scan.startup.mode' = 'earliest-offset',
  'value.format' = 'raw'
)
"""
t_env.execute_sql(SENTENCES_SOURCE_DDL)

# Sink table using the 'print' connector, receiving (word, cnt) rows.
WORD_COUNT_PRINT_DDL = """
CREATE TABLE word_count_print (
  word STRING,
  cnt  BIGINT
) WITH (
  'connector' = 'print'
)
"""
t_env.execute_sql(WORD_COUNT_PRINT_DDL)

# Word count pipeline: split each message on single spaces, trim and
# lower-case every token, discard empty tokens, then count rows per word.
WORD_COUNT_PRINT_INSERT = """
INSERT INTO word_count_print
SELECT word, COUNT(*) AS cnt
FROM (
  SELECT LOWER(TRIM(w)) AS word
  FROM sentences
  CROSS JOIN UNNEST(SPLIT(msg, ' ')) AS t(w)
)
WHERE word <> ''
GROUP BY word
"""
# Kept as the last expression so the notebook displays the TableResult.
t_env.execute_sql(WORD_COUNT_PRINT_INSERT)


<pyflink.table.table_result.TableResult at 0x7fa3cb5ccfd0>

In [None]:
"""
docker exec -it kafka-tools bash 

kafka-topics --bootstrap-server broker:9092 --create --topic sentence --partitions 1 --replication-factor 1  


kafka-console-producer --bootstrap-server broker:9092 --topic sentence 

kafka-console-consumer --bootstrap-server broker:9092 --topic sentence --from-beginning
"""

In [3]:
from pyflink.table import EnvironmentSettings, TableEnvironment

# Create a new streaming-mode TableEnvironment. This rebinds `t_env`, so any
# later use of that name refers to this environment rather than an earlier one.
t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

# Submission and naming options, applied in the listed order.
for option_key, option_value in (
    ("execution.target", "remote"),
    ("rest.address", "jobmanager"),
    ("rest.port", "8081"),
    ("pipeline.name", "windowed-wordcount-to-kafka"),
):
    t_env.get_config().set(option_key, option_value)

# Source table: raw Kafka messages from topic 'sentence', plus a computed
# processing-time column `ts` that the window below groups on.
WINDOWED_SOURCE_DDL = """
CREATE TABLE sentences (
  msg STRING,
  ts AS PROCTIME()
) WITH (
  'connector' = 'kafka',
  'topic' = 'sentence',
  'properties.bootstrap.servers' = 'broker:9092',
  'properties.group.id' = 'wc-win-src',
  'scan.startup.mode' = 'earliest-offset',
  'value.format' = 'raw'
)
"""
t_env.execute_sql("DROP TABLE IF EXISTS sentences")
t_env.execute_sql(WINDOWED_SOURCE_DDL)

# Sink table on topic 'word-count': `word` is declared as the raw-format key
# field and is excluded from the value (EXCEPT_KEY), which leaves `cnt` as the
# only value column. `cnt` is a STRING, so the query casts the count.
WORD_COUNT_OUT_DDL = """
CREATE TABLE word_count_out (
  word STRING,
  cnt  STRING
) WITH (
  'connector' = 'kafka',
  'topic' = 'word-count',
  'properties.bootstrap.servers' = 'broker:9092',
  'key.format' = 'raw',
  'key.fields' = 'word',
  'value.format' = 'raw',
  'value.fields-include' = 'EXCEPT_KEY'
)
"""
t_env.execute_sql("DROP TABLE IF EXISTS word_count_out")
t_env.execute_sql(WORD_COUNT_OUT_DDL)

# Count words per 10-second tumbling window over `ts`: messages are split on
# single spaces, tokens are trimmed and lower-cased, and empty tokens dropped.
# The original author notes that the windowed grouping yields append-only rows.
WINDOWED_WORD_COUNT_INSERT = """
INSERT INTO word_count_out
SELECT word, CAST(COUNT(*) AS STRING) AS cnt
FROM (
  SELECT LOWER(TRIM(w)) AS word, ts
  FROM sentences
  CROSS JOIN UNNEST(SPLIT(msg, ' ')) AS t(w)
  WHERE TRIM(w) <> ''
) s
GROUP BY TUMBLE(ts, INTERVAL '10' SECOND), word
"""
# Kept as the last expression so the notebook displays the TableResult.
t_env.execute_sql(WINDOWED_WORD_COUNT_INSERT)

<pyflink.table.table_result.TableResult at 0x7fa39f838340>