In [1]:
from pyspark.sql import SparkSession

catalog_name = "rainbow-data-production-iceberg"
project_id = "rainbow-data-production-483609"

# Every per-catalog property lives under this prefix.
catalog_prefix = f"spark.sql.catalog.{catalog_name}"

# Maven coordinates handed to `spark.jars.packages` as one comma-separated string.
iceberg_packages = [
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.1",
    "org.apache.iceberg:iceberg-gcp-bundle:1.10.1",
    # Not requested (kept for reference):
    # "com.google.auth:google-auth-library-oauth2-http:1.41.0",
    # "com.google.auth:google-auth-library-credentials:1.41.0",
    # "com.google.guava:guava:32.1.2-jre",
    # "com.google.cloud:google-cloud-storage:2.61.0",
    # "com.google.cloud:libraries-bom:26.73.0",
]

# All session settings in one place; insertion order matches the order in
# which they are applied to the builder below.
spark_settings = {
    # Local master, with the driver pinned to loopback and fixed ports.
    "spark.master": "local[*]",
    "spark.driver.bindAddress": "127.0.0.1",
    "spark.driver.host": "127.0.0.1",
    "spark.driver.port": "7078",
    "spark.blockManager.port": "7079",
    "spark.driver.memory": "2G",
    "spark.executor.memory": "2G",
    "spark.driver.userClassPathFirst": "false",
    "spark.executor.userClassPathFirst": "false",
    # Iceberg REST catalog pointed at the BigLake endpoint.
    catalog_prefix: "org.apache.iceberg.spark.SparkCatalog",
    f"{catalog_prefix}.type": "rest",
    f"{catalog_prefix}.uri": "https://biglake.googleapis.com/iceberg/v1/restcatalog",
    f"{catalog_prefix}.warehouse": f"bq://projects/{project_id}",
    f"{catalog_prefix}.header.x-goog-user-project": project_id,
    f"{catalog_prefix}.rest.auth.type": "org.apache.iceberg.gcp.auth.GoogleAuthManager",
    f"{catalog_prefix}.io-impl": "org.apache.iceberg.gcp.gcs.GCSFileIO",
    f"{catalog_prefix}.rest-metrics-reporting-enabled": "false",
    # Session-wide SQL behaviour: Iceberg extensions, and the catalog above as
    # the default so unqualified names resolve against it.
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": catalog_name,
    "spark.jars.packages": ",".join(iceberg_packages),
}

session_builder = SparkSession.builder.appName("Spark-Iceberg")
for conf_key, conf_value in spark_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

# NOTE(review): getOrCreate() returns an already-running session if one
# exists; presumably a kernel restart is needed for changed settings to take
# effect -- confirm.
spark = session_builder.getOrCreate()


:: loading settings :: url = jar:file:/Users/tuan.tran1/Workspaces/Test/TestIceberg/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/tuan.tran1/.ivy2/cache
The jars for the packages stored in: /Users/tuan.tran1/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-gcp-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f1482565-cc86-4b60-b400-9fac21e9da57;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.10.1 in central
	found org.apache.iceberg#iceberg-gcp-bundle;1.10.1 in central
:: resolution report :: resolve 115ms :: artifacts dl 17ms
	:: modules in use:
	org.apache.iceberg#iceberg-gcp-bundle;1.10.1 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.10.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	----------------------------------

In [16]:
# List the catalogs known to this session; the configured Iceberg catalog is
# expected to show up next to `spark_catalog`.
catalogs_df = spark.sql("SHOW CATALOGS")
catalogs_df.show()

+--------------------+
|             catalog|
+--------------------+
|rainbow-data-prod...|
|       spark_catalog|
+--------------------+



In [17]:
# Check which catalog unqualified names currently resolve against.
current_catalog_df = spark.sql("SELECT current_catalog();")
current_catalog_df.show()

+--------------------+
|   current_catalog()|
+--------------------+
|rainbow-data-prod...|
+--------------------+



In [18]:
# Create the working namespace with an explicit GCS location and a
# `gcp-region` property. IF NOT EXISTS keeps the statement safe to re-run.
# Alternative without location/properties (not used):
#   CREATE NAMESPACE IF NOT EXISTS test_namespace_cv ;
create_namespace_sql = (
    "CREATE NAMESPACE IF NOT EXISTS test_namespace_cv "
    "LOCATION 'gs://rainbow-data-production-iceberg/test_namespace_cv' "
    "WITH DBPROPERTIES ('gcp-region' = 'us-central1');"
)
spark.sql(create_namespace_sql)

DataFrame[]

In [19]:
# List the namespaces in the current catalog to confirm test_namespace_cv exists.
namespaces_df = spark.sql("SHOW NAMESPACES")
namespaces_df.show()

+-----------------+
|        namespace|
+-----------------+
|             test|
|   test_namespace|
|  test_namespace1|
|test_namespace_cv|
+-----------------+



In [20]:
# Switch the session to test_namespace_cv so the following cells can refer to
# tables without a namespace qualifier.
use_namespace_sql = "USE test_namespace_cv;"
spark.sql(use_namespace_sql)

DataFrame[]

In [21]:
# NOTE(review): this cell issues the same USE statement as the cell right
# before it; it looks redundant and is a candidate for removal.
spark.sql(
    "USE test_namespace_cv;"
)

DataFrame[]

In [22]:
# Create the two-column Iceberg demo table in the current namespace.
# IF NOT EXISTS keeps the statement safe to re-run.
create_table_sql = (
    "CREATE TABLE IF NOT EXISTS sample_table "
    "(id BIGINT, data STRING) USING ICEBERG;"
)
spark.sql(create_table_sql)

DataFrame[]

In [23]:
# List the tables in the current namespace to confirm sample_table was created.
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

+-----------------+------------+-----------+
|        namespace|   tableName|isTemporary|
+-----------------+------------+-----------+
|test_namespace_cv|sample_table|      false|
+-----------------+------------+-----------+



In [24]:
# Append the first three demo rows.
# NOTE(review): INSERT INTO appends, so re-running this cell presumably adds
# the same rows again -- confirm before using Run All on an existing table.
insert_first_batch_sql = """
INSERT INTO sample_table VALUES
  (1, 'first'), (2, 'second'), (3, 'third')
"""
spark.sql(insert_first_batch_sql)

                                                                                

DataFrame[]

In [25]:
# Read the table back to check the rows just inserted.
sample_rows_df = spark.sql("SELECT * FROM sample_table")
sample_rows_df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+---+------+
| id|  data|
+---+------+
|  1| first|
|  2|second|
|  3| third|
+---+------+



                                                                                

In [12]:
# Append the second batch of demo rows (ids 4-6).
# NOTE(review): this cell's execution count (12) is lower than the counts of
# the cells above it (16-25), so its recorded output comes from an earlier run
# and the notebook was not executed top to bottom.
insert_second_batch_sql = """
INSERT INTO sample_table VALUES
  (4, 'fourth'), (5, 'fifth'), (6, 'sixth');
"""
spark.sql(insert_second_batch_sql)

                                                                                

DataFrame[]

In [13]:
# Read the whole table back after the second insert.
# NOTE(review): the recorded output below belongs to execution count 13, which
# predates the cells above (16-25); re-run in order to refresh it.
all_rows_df = spark.sql("SELECT * FROM sample_table;")
all_rows_df.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---+------+
| id|  data|
+---+------+
|  1| first|
|  2|second|
|  3| third|
|  4|fourth|
|  5| fifth|
|  6| sixth|
+---+------+



                                                                                