In [3]:
# PyFlink DataStream DAG demo (Flink 1.20, Python)
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.common.typeinfo import Types
import json

import os
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

In [4]:
# DAG - Directed Acylic Graph, a chained operation for Data Flow
# DAG consisted of pipeline and processors, processoros are nothing but logic applied

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)

# Optional: show every operator as a separate vertex (nice for DAG clarity)
env.disable_operator_chaining()



<pyflink.datastream.stream_execution_environment.StreamExecutionEnvironment at 0x7fdb9f8d3160>

In [5]:

# Tiny pipeline
ds = env.from_collection(list(range(10)), type_info=Types.INT())
even = (ds
        .map(lambda x: x + 1, output_type=Types.INT())
        .filter(lambda x: x % 2 == 0))

# --- A) Print DAG/plan (JSON) to console without running the job ---
plan_json = env.get_execution_plan()          # returns a JSON string
print(plan_json)
print("\nPretty:\n", json.dumps(json.loads(plan_json), indent=2))


{
  "nodes" : [ {
    "id" : 1,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 4,
    "type" : "Map, Filter",
    "pact" : "Operator",
    "contents" : "Map, Filter",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 1,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  } ]
}

Pretty:
 {
  "nodes": [
    {
      "id": 1,
      "type": "Source: Collection Source",
      "pact": "Data Source",
      "contents": "Source: Collection Source",
      "parallelism": 1
    },
    {
      "id": 4,
      "type": "Map, Filter",
      "pact": "Operator",
      "contents": "Map, Filter",
      "parallelism": 1,
      "predecessors": [
        {
          "id": 1,
          "ship_strategy": "FORWARD",
          "side": "second"
        }
      ]
    }
  ]
}


In [7]:

# --- B) Run so you can see it on the Dashboard UI (http://localhost:8081) ---
# Printing to stdout just to have a running sink:
even.print()

# now we can see sink in the data flow graph

plan_json = env.get_execution_plan()          # returns a JSON string
print(plan_json)
print("\nPretty:\n", json.dumps(json.loads(plan_json), indent=2))


env.execute("pyflink-dag-demo-datastream")

{
  "nodes" : [ {
    "id" : 1,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 4,
    "type" : "Map, Filter",
    "pact" : "Operator",
    "contents" : "Map, Filter",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 1,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  }, {
    "id" : 6,
    "type" : "Map, Filter",
    "pact" : "Operator",
    "contents" : "Map, Filter",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 1,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  }, {
    "id" : 8,
    "type" : "Map, Filter",
    "pact" : "Operator",
    "contents" : "Map, Filter",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 1,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  }, {
    "id" : 5,
    "type" : "Sink: Print to Std. Out",
    "pact" : "Data Sink",
    "contents" : "Sink: Print to

<pyflink.common.job_execution_result.JobExecutionResult at 0x7fdb9f99f0d0>

In [8]:
from pyflink.datastream.functions import MapFunction, RuntimeContext
from pyflink.common.typeinfo import Types
import socket

In [9]:
# THIS IS Demonstration to explain how it works internally in Task manager
from pyflink.common.typeinfo import Types
import socket, os, threading, json

class WhoAmI(MapFunction):
    def open(self, rc: RuntimeContext):
        self.subtask = rc.get_index_of_this_subtask()
        self.host = socket.gethostname()
    def map(self, x):
        return f"host={self.host} subtask={self.subtask} value={x}"


class Inspect(MapFunction):
    def open(self, rc: RuntimeContext):
        # RuntimeContext-based task info
        self.task_name = rc.get_task_name()                       # e.g. "Map"
        self.task_name_ws = rc.get_task_name_with_subtasks()      # e.g. "Map (2/3)"
        self.subtask_index = rc.get_index_of_this_subtask()       # 0..p-1
        self.parallelism = rc.get_number_of_parallel_subtasks()   # p
        self.attempt = rc.get_attempt_number()                    # 0 on first try

        # Environment/process info
        self.host = socket.gethostname()                          # TM host/container
        self.pid = os.getpid()                                    # Python worker PID
        self.thread_id = threading.get_ident()                    # Python thread id

    def map(self, x):
        return json.dumps({
            "value": x,
            "host": self.host,
            "pid": self.pid,
            "thread_id": self.thread_id,
            "task_name": self.task_name,
            "task_name_with_subtasks": self.task_name_ws,
            "subtask_index": self.subtask_index,
            "parallelism": self.parallelism,
            "attempt": self.attempt
        })


In [23]:
env = get_env.get_remote_env()
env.set_parallelism(3)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "3")
env.disable_operator_chaining()  

# pretend "splits": 3 buckets -> observe routing differences below
src = env.from_collection([("a",1),("b",2),("c",3),("a",4),("b",5),("c",6)],
                          type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# key_by uses hash partitioning subtask = f(hash(key), maxParallelism, parallelism) via key-groups.
# key_by same key-group â†’ one subtask
# Try different partitioners one by one to SEE how routing changes:
stream = src.key_by(lambda kv: kv[0])          # KEYED (by first field)
 
stream.map(Inspect(), output_type=Types.STRING()).set_parallelism(3).print()
print(env.get_execution_plan())

env.execute("dag-partition-demo-keyby")

{
  "nodes" : [ {
    "id" : 109,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 110,
    "type" : "_stream_key_by_map_operator",
    "pact" : "Operator",
    "contents" : "_stream_key_by_map_operator",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 109,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  }, {
    "id" : 112,
    "type" : "Map",
    "pact" : "Operator",
    "contents" : "Map",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 110,
      "ship_strategy" : "HASH",
      "side" : "second"
    } ]
  }, {
    "id" : 113,
    "type" : "Sink: Print to Std. Out",
    "pact" : "Data Sink",
    "contents" : "Sink: Print to Std. Out",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 112,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  } ]
}


<pyflink.common.job_execution_result.JobExecutionResult at 0x7f932534a8f0>

In [15]:
env = get_env.get_remote_env()
env.set_parallelism(3)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "3")
env.disable_operator_chaining()  


# pretend "splits": 3 buckets -> observe routing differences below
src = env.from_collection([("a",1),("b",2),("c",3),("a",4),("b",5),("c",6)],
                          type_info=Types.TUPLE([Types.STRING(), Types.INT()]))
 
stream = src.rebalance()                     # round-robin
 


# stream.map(WhoAmI(), output_type=Types.STRING()).print()
stream.map(Inspect(), output_type=Types.STRING()).set_parallelism(3).print()
print(env.get_execution_plan())

env.execute("dag-partition-rebalance")

{
  "nodes" : [ {
    "id" : 69,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 71,
    "type" : "Map",
    "pact" : "Operator",
    "contents" : "Map",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 69,
      "ship_strategy" : "REBALANCE",
      "side" : "second"
    } ]
  }, {
    "id" : 72,
    "type" : "Sink: Print to Std. Out",
    "pact" : "Data Sink",
    "contents" : "Sink: Print to Std. Out",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 71,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  } ]
}


<pyflink.common.job_execution_result.JobExecutionResult at 0x7f93253498a0>

In [16]:
env = get_env.get_remote_env()
env.set_parallelism(3)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "3")
env.disable_operator_chaining()  


# pretend "splits": 3 buckets -> observe routing differences below
src = env.from_collection([("a",1),("b",2),("c",3),("a",4),("b",5),("c",6)],
                          type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

 
stream = src.rescale()                       # partial round-robin (upstream->subset downstream)
 

stream.map(Inspect(), output_type=Types.STRING()).set_parallelism(6).print()
print(env.get_execution_plan())

env.execute("dag-partition-rescale")

{
  "nodes" : [ {
    "id" : 75,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 77,
    "type" : "Map",
    "pact" : "Operator",
    "contents" : "Map",
    "parallelism" : 6,
    "predecessors" : [ {
      "id" : 75,
      "ship_strategy" : "RESCALE",
      "side" : "second"
    } ]
  }, {
    "id" : 78,
    "type" : "Sink: Print to Std. Out",
    "pact" : "Data Sink",
    "contents" : "Sink: Print to Std. Out",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 77,
      "ship_strategy" : "REBALANCE",
      "side" : "second"
    } ]
  } ]
}


<pyflink.common.job_execution_result.JobExecutionResult at 0x7f932251c160>

In [17]:
env = get_env.get_remote_env()
env.set_parallelism(3)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "3")
env.disable_operator_chaining()  


# pretend "splits": 3 buckets -> observe routing differences below
src = env.from_collection([("a",1),("b",2),("c",3),("a",4),("b",5),("c",6)],
                          type_info=Types.TUPLE([Types.STRING(), Types.INT()]))


stream = src.shuffle()                       # random

stream.map(Inspect(), output_type=Types.STRING()).set_parallelism(3).print()
print(env.get_execution_plan())

env.execute("dag-partition-shuffle")

{
  "nodes" : [ {
    "id" : 81,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 83,
    "type" : "Map",
    "pact" : "Operator",
    "contents" : "Map",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 81,
      "ship_strategy" : "SHUFFLE",
      "side" : "second"
    } ]
  }, {
    "id" : 84,
    "type" : "Sink: Print to Std. Out",
    "pact" : "Data Sink",
    "contents" : "Sink: Print to Std. Out",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 83,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  } ]
}


<pyflink.common.job_execution_result.JobExecutionResult at 0x7f932254ada0>

In [9]:
env = get_env.get_remote_env()
env.set_parallelism(3)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "3")
env.disable_operator_chaining()  


# pretend "splits": 3 buckets -> observe routing differences below
src = env.from_collection([("a",1),("b",2),("c",3),("a",4),("b",5),("c",6)],
                          type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

stream = src.broadcast()                     # duplicates to all downstream subtasks

stream.map(Inspect(), output_type=Types.STRING()).set_parallelism(3).print()
print(env.get_execution_plan())

env.execute("dag-partition-broadcast")

{
  "nodes" : [ {
    "id" : 49,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 51,
    "type" : "Map",
    "pact" : "Operator",
    "contents" : "Map",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 49,
      "ship_strategy" : "BROADCAST",
      "side" : "second"
    } ]
  }, {
    "id" : 52,
    "type" : "Sink: Print to Std. Out",
    "pact" : "Data Sink",
    "contents" : "Sink: Print to Std. Out",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 51,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  } ]
}


<pyflink.common.job_execution_result.JobExecutionResult at 0x7f93279928f0>

In [20]:
env = get_env.get_remote_env()
env.set_parallelism(3)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "3")
env.disable_operator_chaining()  

from pyflink.datastream.functions import Partitioner


# pretend "splits": 3 buckets -> observe routing differences below
src = env.from_collection([("a",1),("b",2),("c",3),("a",4),("b",5),("c",6)],
                          type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# Try different partitioners one by one to SEE how routing changes:
stream = src.key_by(lambda kv: kv[0])          # KEYED (by first field)


class OddEvenPartitioner(Partitioner):
    def partition(self, key, num_partitions: int) -> int:
        # send even keys to subtask 0, odd to subtask 1 (mod to be safe)
        return (key % 2) % num_partitions

# choose which part of the record becomes the "key" fed to the partitioner:
# custom partition based on key or value
stream = src.partition_custom(OddEvenPartitioner(), key_selector=lambda r: r[1])

 
stream.map(Inspect(), output_type=Types.STRING()).set_parallelism(3).print()
print(env.get_execution_plan())

env.execute("custom-partition-demo")

{
  "nodes" : [ {
    "id" : 87,
    "type" : "Source: Collection Source",
    "pact" : "Data Source",
    "contents" : "Source: Collection Source",
    "parallelism" : 1
  }, {
    "id" : 88,
    "type" : "_stream_key_by_map_operator",
    "pact" : "Operator",
    "contents" : "_stream_key_by_map_operator",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 87,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  }, {
    "id" : 90,
    "type" : "_partition_custom_map_operator",
    "pact" : "Operator",
    "contents" : "_partition_custom_map_operator",
    "parallelism" : 1,
    "predecessors" : [ {
      "id" : 87,
      "ship_strategy" : "FORWARD",
      "side" : "second"
    } ]
  }, {
    "id" : 98,
    "type" : "_keyed_stream_values_operator, Map",
    "pact" : "Operator",
    "contents" : "_keyed_stream_values_operator, Map",
    "parallelism" : 3,
    "predecessors" : [ {
      "id" : 90,
      "ship_strategy" : "CUSTOM",
      "side" : "second"
    }

<pyflink.common.job_execution_result.JobExecutionResult at 0x7f93245434c0>