In [1]:
import os
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

import get_env
env = get_env.get_remote_env()
t_env = StreamTableEnvironment.create(env)
t_env.get_config().set("parallelism.default", "1")

2025-12-02T03:17:32.782359Z main ERROR Reconfiguration failed: No configuration found for '63e31ee' at 'null' in 'null'
2025-12-02T03:17:33.888567Z Thread-3 ERROR Reconfiguration failed: No configuration found for '10be4640' at 'null' in 'null'


<pyflink.table.table_config.TableConfig at 0x7f13a35f7e50>

In [2]:

conf = t_env.get_config().get_configuration()
conf.set_string("fs.allowed-fallback-filesystems", "hadoop")
conf.set_string("fs.gs.project.id", "flink-demo-470113")
conf.set_string("fs.gs.auth.service.account.json.keyfile", "/etc/gcp/key.json")

<pyflink.common.configuration.Configuration at 0x7f13a2744640>

In [3]:

# Optional: set your project id (helps GCS connector)
# t_env.get_config().get_configuration().set_string(
#     "fs.gs.project.id", "your-gcp-project-id"
# )

# 1) Drop if exists
t_env.execute_sql("DROP TABLE IF EXISTS movies")

# 2) Register the CSV table on GCS
# NOTE: csv.ignore-parse-errors='true' will skip the header row
ddl = """
CREATE TABLE movies (
    movieId INT,
    title   STRING,
    genres  STRING
) WITH (
    'connector' = 'filesystem',
    'path'      = 'gs://gk2-datalake/bronze/movies/',
    'format'    = 'csv',
    'csv.ignore-parse-errors' = 'true',
    'csv.source.ignore-first-line' = 'true'
)
"""
t_env.execute_sql(ddl)

# 3) Get the table and print schema
movies = t_env.from_path("movies")
print("=== Schema ===")
movies.print_schema()

# 4) Print the data (bounded filesystem source -> will finish)
print("=== Data ===")
#result = movies.execute()
#result.print()

limited = movies.limit(5)
limited.execute().print()


=== Schema ===
(
  `movieId` INT,
  `title` STRING,
  `genres` STRING
)
=== Data ===
+----+-------------+--------------------------------+--------------------------------+
| op |     movieId |                          title |                         genres |
+----+-------------+--------------------------------+--------------------------------+
| +I |      <NULL> |                          title |                         genres |
| +I |           1 |               Toy Story (1995) | Adventure|Animation|Childre... |
| +I |           2 |                 Jumanji (1995) |     Adventure|Children|Fantasy |
| +I |           3 |        Grumpier Old Men (1995) |                 Comedy|Romance |
| +I |           4 |       Waiting to Exhale (1995) |           Comedy|Drama|Romance |
+----+-------------+--------------------------------+--------------------------------+
5 rows in set


In [4]:
t_env.execute_sql("DROP TABLE IF EXISTS movies_json")
# Sink table: filesystem JSON
ddl_sink = """
CREATE TABLE movies_json (
    movieId INT,
    title   STRING,
    genres  STRING
) WITH (
    'connector' = 'filesystem',
    'path'      = 'gs://gk2-datalake/bronze/movies-json/',
    'format'    = 'json'
)
"""
t_env.execute_sql(ddl_sink)

<pyflink.table.table_result.TableResult at 0x7f139efb5ea0>

In [5]:

# Insert while filtering out rows where movieId is NULL (this removes the header)
insert_sql = """
INSERT INTO movies_json
SELECT movieId, title, genres
FROM movies
WHERE movieId IS NOT NULL
"""
# execute the insert (returns a TableResult)
print("Starting job to write JSON to GCS...")
job_result = t_env.execute_sql(insert_sql)
print("Submitted. Job info:", job_result)


Starting job to write JSON to GCS...
Submitted. Job info: <pyflink.table.table_result.TableResult object at 0x7f13a35f4ee0>


In [6]:
# Drop temporary table safely
try:
    t_env.execute_sql("DROP TEMPORARY TABLE movies_json")
except Exception:
    pass   # ignore error if it doesn't exist
    
# Register the JSON table
t_env.execute_sql("""
CREATE TEMPORARY TABLE movies_json (
    movieId INT,
    title STRING,
    genres STRING
) WITH (
    'connector' = 'filesystem',
    'path' = 'gs://gk2-datalake/bronze/movies-json/',
    'format' = 'json',
    'json.ignore-parse-errors' = 'true'
)
""")


# 3) Get the table and print schema
movies = t_env.from_path("movies_json")
print("=== Schema ===")
movies.print_schema()

# 4) Print the data (bounded filesystem source -> will finish)
print("=== Data ===")
#result = movies.execute()
#result.print()

limited = movies.limit(5)
limited.execute().print()


=== Schema ===
=== Data ===
(
  `movieId` INT,
  `title` STRING,
  `genres` STRING
)
+----+-------------+--------------------------------+--------------------------------+
| op |     movieId |                          title |                         genres |
+----+-------------+--------------------------------+--------------------------------+
| +I |           1 |               Toy Story (1995) | Adventure|Animation|Childre... |
| +I |           2 |                 Jumanji (1995) |     Adventure|Children|Fantasy |
| +I |           3 |        Grumpier Old Men (1995) |                 Comedy|Romance |
| +I |           4 |       Waiting to Exhale (1995) |           Comedy|Drama|Romance |
| +I |           5 | Father of the Bride Part II... |                         Comedy |
+----+-------------+--------------------------------+--------------------------------+
5 rows in set


In [7]:
movies_sql = t_env.sql_query("SELECT * FROM movies_json LIMIT 5")
movies_sql.execute().print()

+----+-------------+--------------------------------+--------------------------------+
| op |     movieId |                          title |                         genres |
+----+-------------+--------------------------------+--------------------------------+
| +I |           1 |               Toy Story (1995) | Adventure|Animation|Childre... |
| +I |           2 |                 Jumanji (1995) |     Adventure|Children|Fantasy |
| +I |           3 |        Grumpier Old Men (1995) |                 Comedy|Romance |
| +I |           4 |       Waiting to Exhale (1995) |           Comedy|Drama|Romance |
| +I |           5 | Father of the Bride Part II... |                         Comedy |
+----+-------------+--------------------------------+--------------------------------+
5 rows in set


In [8]:

# Query 5 records
# Returns TableResult
result = t_env.execute_sql("SELECT * FROM movies_json LIMIT 5")

print 
it = result.collect()
for i, row in enumerate(it):
    print(row)            # prints Row(movieId=..., title='...', genres='...')

it.close()


<Row(1, 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy')>
<Row(2, 'Jumanji (1995)', 'Adventure|Children|Fantasy')>
<Row(3, 'Grumpier Old Men (1995)', 'Comedy|Romance')>
<Row(4, 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance')>
<Row(5, 'Father of the Bride Part II (1995)', 'Comedy')>


In [9]:
# upper case, where condition


movies_sql = t_env.sql_query("""
    SELECT 
        movieId,
        UPPER(title) AS title_upper,
        genres
    FROM movies_json
    WHERE genres LIKE '%Comedy%'
    LIMIT 5 
""")

movies_sql.execute().print()


+----+-------------+--------------------------------+--------------------------------+
| op |     movieId |                    title_upper |                         genres |
+----+-------------+--------------------------------+--------------------------------+
| +I |           1 |               TOY STORY (1995) | Adventure|Animation|Childre... |
| +I |           3 |        GRUMPIER OLD MEN (1995) |                 Comedy|Romance |
| +I |           4 |       WAITING TO EXHALE (1995) |           Comedy|Drama|Romance |
| +I |           5 | FATHER OF THE BRIDE PART II... |                         Comedy |
| +I |           7 |                 SABRINA (1995) |                 Comedy|Romance |
+----+-------------+--------------------------------+--------------------------------+
5 rows in set


In [10]:
# 3) Drop tables if they exist (safe pattern)
for q in [
    "DROP TEMPORARY TABLE IF EXISTS ratings_csv",    # note: IF EXISTS may error for TEMP in some versions; try/except below
    "DROP TEMPORARY TABLE IF EXISTS ratings_json"
]:
    try:
        t_env.execute_sql(q)
    except Exception:
        # ignore - some Flink builds disallow IF EXISTS for temporary tables
        pass

In [11]:
t_env.execute_sql("""
CREATE TEMPORARY TABLE ratings_csv (
  `userId` INT,
  `movieId` INT,
  `rating` DOUBLE,
  `timestamp` BIGINT
) WITH (
  'connector' = 'filesystem',
  'path' = 'gs://gk2-datalake/bronze/ratings/',
  'format' = 'csv',
  'csv.field-delimiter' = ',',
  'csv.ignore-parse-errors' = 'true',
  'csv.allow-comments' = 'false',
  'csv.first-line-as-header' = 'true'
)
""")


# 3) Get the table and print schema
ratings_csv = t_env.from_path("ratings_csv")
print("=== Schema ===")
ratings_csv.print_schema()

# 4) Print the data (bounded filesystem source -> will finish)
print("=== Data ===")

limited = ratings_csv.limit(5)
limited.execute().print()

=== Schema ===
=== Data ===
(
  `userId` INT,
  `movieId` INT,
  `rating` DOUBLE,
  `timestamp` BIGINT
)
+----+-------------+-------------+--------------------------------+----------------------+
| op |      userId |     movieId |                         rating |            timestamp |
+----+-------------+-------------+--------------------------------+----------------------+
| +I |      <NULL> |      <NULL> |                         <NULL> |               <NULL> |
| +I |           1 |           1 |                            4.0 |            964982703 |
| +I |           1 |           3 |                            4.0 |            964981247 |
| +I |           1 |           6 |                            4.0 |            964982224 |
| +I |           1 |          47 |                            5.0 |            964983815 |
+----+-------------+-------------+--------------------------------+----------------------+
5 rows in set


In [12]:
# 5) Create JSON sink table (newline-delimited JSON)
t_env.execute_sql("""
CREATE TEMPORARY TABLE ratings_json (
  `userId` INT,
  `movieId` INT,
  `rating` DOUBLE,
  `timestamp` BIGINT
) WITH (
  'connector' = 'filesystem',
  'path' = 'gs://gk2-datalake/bronze/ratings-json/',
  'format' = 'json',
  'json.ignore-parse-errors' = 'true'
)
""")

<pyflink.table.table_result.TableResult at 0x7f13a35f7f40>

In [13]:
from pyflink.table.expressions import col, to_timestamp_ltz, lit
from pyflink.table import DataTypes

ratings = t_env.from_path("ratings_csv")

transformed = (
    ratings
      # <-- IMPORTANT: do NOT call is_not_null(), use the Expression object
      .filter(col("userId").is_not_null)

      .add_columns(
          to_timestamp_ltz(
              # cast to STRING first, trim whitespace, then cast to BIGINT, multiply by 1000
              (col("timestamp").cast(DataTypes.STRING()).trim().cast(DataTypes.BIGINT()) * lit(1000)),
              3
          ).alias("rate_date")
      )

      # .drop_columns(col("timestamp"))  

      # explicitly select columns to exclude userId
      .select(
          col("movieId"),
          col("rating"),
          # col("timestamp"),
          col("rate_date")   # alias created above
      )
)

transformed.limit(5).execute().print()


+----+-------------+--------------------------------+-------------------------+
| op |     movieId |                         rating |               rate_date |
+----+-------------+--------------------------------+-------------------------+
| +I |           1 |                            4.0 | 2000-07-30 18:45:03.000 |
| +I |           3 |                            4.0 | 2000-07-30 18:20:47.000 |
| +I |           6 |                            4.0 | 2000-07-30 18:37:04.000 |
| +I |          47 |                            5.0 | 2000-07-30 19:03:35.000 |
| +I |          50 |                            5.0 | 2000-07-30 18:48:51.000 |
+----+-------------+--------------------------------+-------------------------+
5 rows in set


In [14]:
filtered = transformed.filter(col("rating") == 0.5)
filtered.limit(5).execute().print()

+----+-------------+--------------------------------+-------------------------+
| op |     movieId |                         rating |               rate_date |
+----+-------------+--------------------------------+-------------------------+
| +I |          31 |                            0.5 | 2011-05-27 02:32:58.000 |
| +I |         527 |                            0.5 | 2011-05-27 02:44:35.000 |
| +I |         647 |                            0.5 | 2011-05-27 02:33:39.000 |
| +I |         688 |                            0.5 | 2011-05-27 02:43:48.000 |
| +I |         720 |                            0.5 | 2011-05-27 02:33:15.000 |
+----+-------------+--------------------------------+-------------------------+
5 rows in set


In [15]:
# View, View is not materilized, temp table is materialized
try:
    t_env.execute_sql("DROP TEMPORARY VIEW transformed_view")
except:
    pass

try:
    t_env.execute_sql("DROP TEMPORARY VIEW filtered_view")
except:
    pass

In [16]:
# safe drop (ignore errors if not present)
for v in ("transformed_view", "filtered_view"):
    try:
        t_env.execute_sql(f"DROP TEMPORARY VIEW {v}")
    except Exception:
        pass

# create transformed_view using quoted identifier for the timestamp column
t_env.execute_sql("""
CREATE TEMPORARY VIEW transformed_view AS
SELECT
  movieId,
  rating,
  TO_TIMESTAMP_LTZ(
      CAST(TRIM(CAST(`timestamp` AS STRING)) AS BIGINT) * 1000,
      3
  ) AS rate_date
FROM ratings_csv
WHERE `userId` IS NOT NULL
""")

# preview 5 rows
t_env.sql_query("SELECT * FROM transformed_view LIMIT 5").execute().print()


+----+-------------+--------------------------------+-------------------------+
| op |     movieId |                         rating |               rate_date |
+----+-------------+--------------------------------+-------------------------+
| +I |           1 |                            4.0 | 2000-07-30 18:45:03.000 |
| +I |           3 |                            4.0 | 2000-07-30 18:20:47.000 |
| +I |           6 |                            4.0 | 2000-07-30 18:37:04.000 |
| +I |          47 |                            5.0 | 2000-07-30 19:03:35.000 |
| +I |          50 |                            5.0 | 2000-07-30 18:48:51.000 |
+----+-------------+--------------------------------+-------------------------+
5 rows in set


In [17]:
# Safe create + insert into partitioned JSON sink (copy-paste)
from pyflink.table import TableResult

# 1) safe drop if exists
try:
    t_env.execute_sql("DROP TABLE IF EXISTS movies_partitioned_json")
except Exception:
    pass

# use backtick ` for reserved column name 

# 2) create partitioned table (quote potentially reserved names with backticks)
t_env.execute_sql("""
CREATE TABLE movies_partitioned_json (
  movieId INT,
  rating DOUBLE,
  rate_date TIMESTAMP(3),
  `year` INT,
  `month` INT,
  `day` INT
) PARTITIONED BY (`year`, `month`, `day`)
WITH (
  'connector' = 'filesystem',
  'path' = 'gs://gk2-datalake/bronze/ratings-json-partition/',
  'format' = 'json',
  'json.ignore-parse-errors' = 'true',
  -- optional partition commit (may be unsupported on some builds; remove if parser complains)
  'sink.partition-commit.policy.kind' = 'success-file',
  'sink.partition-commit.delay' = '0s',
  'sink.partition-commit.trigger' = 'partition-time'
)
""")



<pyflink.table.table_result.TableResult at 0x7f139efb7760>

In [18]:

# cast output types to match sink schema: TIMESTAMP(3) and INT for partitions
insert_stmt = """
INSERT INTO movies_partitioned_json
SELECT
  movieId,
  rating,
  CAST(rate_date AS TIMESTAMP(3))                      AS rate_date,
  CAST(EXTRACT(YEAR FROM rate_date)  AS INT)           AS `year`,
  CAST(EXTRACT(MONTH FROM rate_date) AS INT)           AS `month`,
  CAST(EXTRACT(DAY FROM rate_date)   AS INT)           AS `day`
FROM transformed_view
"""
print("Submitting INSERT job (with casts)...")
insert_result = t_env.execute_sql(insert_stmt)
print("Insert submitted:", insert_result)



Submitting INSERT job (with casts)...
Insert submitted: <pyflink.table.table_result.TableResult object at 0x7f139efb5a50>


In [19]:
job_id = insert_result.get_job_client().get_job_id()
print ("job", job_id)

status = insert_result.get_job_client().get_job_status().result()
 
# keep checking until FINISHED
print("status:", status, str(status))


job 0f5facf172ea17017e22eaf948f24f5e
status: JobStatus.RUNNING JobStatus.RUNNING


In [20]:

# 4) quick verify (may return results only after job finishes and files are visible)
try:
    t_env.sql_query("SELECT movieId, rating, rate_date, `year`, `month`, `day` FROM movies_partitioned_json LIMIT 5").execute().print()
except Exception as e:
    print("Verify query error (ok if job still running):", e)
    print("Check GCS path: gs://gk2-datalake/bronze/ratings-json-partition/")


Verify query error (ok if job still running): org.apache.flink.table.api.TableException: Fetch partitions fail.
	at org.apache.flink.connector.file.table.FileSystemTableSource.listPartitions(FileSystemTableSource.java:346)
	at org.apache.flink.connector.file.table.FileSystemTableSource.getOrFetchPartitions(FileSystemTableSource.java:437)
	at org.apache.flink.connector.file.table.FileSystemTableSource.getScanRuntimeProvider(FileSystemTableSource.java:130)
	at org.apache.flink.table.planner.plan.nodes.exec.common.CommonExecTableSourceScan.translateToPlanInternal(CommonExecTableSourceScan.java:121)
	at org.apache.flink.table.planner.plan.nodes.exec.ExecNodeBase.translateToPlan(ExecNodeBase.java:168)
	at org.apache.flink.table.planner.plan.nodes.exec.ExecEdge.translateToPlan(ExecEdge.java:259)
	at org.apache.flink.table.planner.plan.nodes.exec.stream.StreamExecExchange.translateToPlanInternal(StreamExecExchange.java:99)
	at org.apache.flink.table.planner.plan.nodes.exec.ExecNodeBase.transl