In [0]:
!pip install kagglehub

In [0]:
import kagglehub

# Fetch the newest published version of the dataset; the call hands back the
# location of the downloaded files, which is kept in `path`.
path = kagglehub.dataset_download(
    "asthamular/people-10-m"
)

print("Path to dataset files:", path)

In [0]:
# Read the downloaded CSV files (header row present, column types inferred) from
# the directory kagglehub reported in `path`, instead of a hard-coded cache
# location pinned to dataset version 1. The "file:" prefix keeps the same URI
# scheme the hard-coded path used.
init_sdf = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(f"file:{path}")
)

In [0]:
# Persist the loaded DataFrame as the table default.people_10m, creating it or
# replacing any existing table of that name.
people_table_writer = init_sdf.writeTo("default.people_10m")
people_table_writer.createOrReplace()

In [0]:
# Using the Delta Lake Python API: declare default.people_10m_2 column by column.
# createIfNotExists leaves an already-existing table untouched.
from delta.tables import DeltaTable

(
    DeltaTable.createIfNotExists(spark)
    .tableName("default.people_10m_2")
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .execute()
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from datetime import date

# Explicit schema for the hand-written update rows below; every column is nullable.
schema = StructType([
  StructField("id", IntegerType(), True),
  StructField("firstName", StringType(), True),
  StructField("middleName", StringType(), True),
  StructField("lastName", StringType(), True),
  StructField("gender", StringType(), True),
  StructField("birthDate", DateType(), True),
  StructField("ssn", StringType(), True),
  StructField("salary", IntegerType(), True)
])

# Six sample rows, one tuple per person, in the same column order as `schema`.
data = [
  (9999998, 'Billy', 'Tommie', 'Luppitt', 'M', date.fromisoformat('1992-09-17'), '953-38-9452', 55250),
  (9999999, 'Elias', 'Cyril', 'Leadbetter', 'M', date.fromisoformat('1984-05-22'), '906-51-2137', 48500),
  (10000000, 'Joshua', 'Chas', 'Broggio', 'M', date.fromisoformat('1968-07-22'), '988-61-6247', 90000),
  (20000001, 'John', '', 'Doe', 'M', date.fromisoformat('1978-01-14'), '345-67-8901', 55500),
  (20000002, 'Mary', '', 'Smith', 'F', date.fromisoformat('1982-10-29'), '456-78-9012', 98250),
  (20000003, 'Jane', '', 'Doe', 'F', date.fromisoformat('1981-06-25'), '567-89-0123', 89900)
]

people_10m_updates = spark.createDataFrame(data, schema)
# createOrReplaceTempView (rather than createTempView) keeps this cell re-runnable:
# createTempView raises if a view with this name is already registered in the session.
people_10m_updates.createOrReplaceTempView("people_10m_updates")

# ...

from delta.tables import DeltaTable

# Upsert people_10m_updates into default.people_10m, matching rows on id:
# matched rows take all values from the update row, unmatched rows are inserted.
deltaTable = DeltaTable.forName(spark, "default.people_10m")

merge_builder = deltaTable.alias("people_10m").merge(
    people_10m_updates.alias("people_10m_updates"),
    "people_10m.id = people_10m_updates.id",
)
merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
# Read the table back and show only the rows whose id is 9999998 or higher.
df = spark.read.table("default.people_10m")
df_filtered = df.filter("id >= 9999998")
display(df_filtered)

Read from and write to a Delta table

In [0]:
# Load default.people_10m as a DataFrame and render it in the notebook.
people_df = spark.read.table(tableName="default.people_10m")
display(people_df)

In [0]:
# Append the rows of `df` to default.people_10m.
# NOTE(review): `df` is not defined in this cell; the only visible assignment is an
# earlier cell that reads default.people_10m itself. If that is the `df` in scope,
# this appends the table to its own contents and duplicates every row on each run.
# Confirm the intended source DataFrame / target table.
df.write.mode("append").saveAsTable("default.people_10m")

In [0]:
# Import only the names this cell uses. A wildcard import from
# pyspark.sql.functions would shadow Python builtins such as sum, min, max and round.
from delta.tables import DeltaTable
from pyspark.sql.functions import col, lit

deltaTable = DeltaTable.forName(spark, "default.people_10m")

# Declare the predicate by using a SQL-formatted string.
# In this form the new value is a SQL expression too, hence the nested quotes
# around 'Female'.
deltaTable.update(
  condition = "gender = 'F'",
  set = { "gender": "'Female'" }
)

# Declare the predicate by using Spark SQL functions.
deltaTable.update(
  condition = col('gender') == 'M',
  set = { 'gender': lit('Male') }
)

In [0]:
# Explicit import instead of a wildcard so it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

# Show the history of default.people_10m as returned by DeltaTable.history().
deltaTable = DeltaTable.forName(spark, "default.people_10m")
display(deltaTable.history())

In [0]:
# Explicit import instead of a wildcard so it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaHistory = deltaTable.history()

# Narrow the history DataFrame to the entry for version 0.
display(deltaHistory.where("version == 0"))
# Or:
# display(deltaHistory.where("timestamp == '2024-05-15T22:43:15.000+00:00'"))

In [0]:
# Explicit import instead of a wildcard so it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

# Run OPTIMIZE in compaction mode on default.people_10m.
deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaTable.optimize().executeCompaction()

In [0]:
# Explicit import instead of a wildcard so it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

# Run OPTIMIZE with Z-ordering by the gender column on default.people_10m.
deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaTable.optimize().executeZOrderBy("gender")

In [0]:
# Explicit import instead of a wildcard so it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

# Vacuum default.people_10m. No retention argument is passed, so whatever
# retention period Delta applies by default is used.
deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaTable.vacuum()