#### Setup tasks for Delta Lake demo

#### Import Python packages

In [0]:
import lakefs
from lakefs.client import Client
import os
from pyspark.sql.types import ByteType, IntegerType, LongType, StringType, StructType, StructField
from pyspark.sql.functions import *

#### Working with the lakeFS Python client API

###### Note: Learn more about the [lakeFS Python integration](https://docs.lakefs.io/integrations/python.html)

In [0]:

# Build the lakeFS client object (`clt`) from the endpoint and key pair.
# lakefsEndPoint, lakefsAccessKey and lakefsSecretKey are not defined in this
# section; they must already exist in the notebook when this cell runs.
clt = Client(
   host=lakefsEndPoint,
   username=lakefsAccessKey,
   password=lakefsSecretKey,
)

#### Verify lakeFS credentials

In [0]:
# Print the value of clt.version between two status messages.
# NOTE(review): presumably reading clt.version contacts the lakeFS server, so
# an exception here would point at a bad endpoint or credentials — confirm
# against the lakeFS SDK documentation.
print("Verifying lakeFS credentials")
print(clt.version)
print("lakeFS credentials verified")

#### Define functions

In [0]:
def delta_table_compare_branches(table, refs):
    """Display the row count of a Delta table for each of the given refs.

    For every ref, the table at ``lakefs://{repositoryName}/{ref}/{table}`` is
    loaded in Delta format and counted; the (ref, count) pairs are then shown
    as a two-column DataFrame ("Branch", "Count").

    Args:
        table: Path of the Delta table below the ref.
        refs: Iterable of ref names to compare.

    Relies on the notebook-level names ``spark`` and ``repositoryName``,
    which are not defined in this function.
    """
    # Materialize once so that a one-shot iterator is not consumed twice.
    refs = list(refs)

    counts = [
        (ref, spark.read.format('delta').load(f'lakefs://{repositoryName}/{ref}/{table}').count())
        for ref in refs
    ]

    schema = StructType([
        StructField("Branch", StringType(), True),
        # count() yields a 64-bit value; IntegerType would reject any count
        # above 2**31 - 1 during schema verification, so use LongType.
        StructField("Count", LongType(), True),
    ])

    spark.createDataFrame(data=counts, schema=schema).display(truncate=False)

def print_diff(diff):
    """Print the entries of ``diff`` as a plain-text table.

    Args:
        diff: Iterable of entries, each exposing the attributes ``path``,
            ``path_type``, ``size_bytes`` and ``type``.

    One table row is printed per entry, in iteration order, under the
    headers Path / Path Type / Size(Bytes) / Type.
    """
    # Rows are produced lazily; tabulate consumes the generator.
    table_rows = (
        [entry.path, entry.path_type, entry.size_bytes, entry.type]
        for entry in diff
    )

    # Imported here so the dependency is only needed when a diff is printed.
    from tabulate import tabulate

    column_titles = ['Path','Path Type','Size(Bytes)','Type']
    rendered = tabulate(table_rows, headers=column_titles)
    print(rendered)

#### For this demo, we'll use the [Orion Star - Sports and outdoors RDBMS dataset](https://www.kaggle.com/datasets/chethanp11/orion-star-sports-and-outdoors-rdbms-dataset) from [Kaggle](https://www.kaggle.com/).

#### Define schema for Customers table

In [0]:
# Schema of the Customers table, built from (column name, data type, nullable)
# triples. Only Personal_ID is declared nullable.
customersSchema = StructType([
    StructField(column, dtype, nullable)
    for column, dtype, nullable in (
        ("Customer_ID", IntegerType(), False),
        ("Country", StringType(), False),
        ("Gender", StringType(), False),
        ("Personal_ID", IntegerType(), True),
        ("Customer_Name", StringType(), False),
        ("Customer_FirstName", StringType(), False),
        ("Customer_LastName", StringType(), False),
        ("Birth_Date", StringType(), False),
        ("Customer_Address", StringType(), False),
        ("Street_ID", LongType(), False),
        ("Street_Number", IntegerType(), False),
        ("Customer_Type_ID", IntegerType(), False),
    )
])

#### Define schema for Orders table

In [0]:
# Schema of the Orders table, built from (column name, data type, nullable)
# triples. Only Order_ID is declared nullable; dates and prices are kept as
# strings.
ordersSchema = StructType([
    StructField(column, dtype, nullable)
    for column, dtype, nullable in (
        ("Customer_ID", IntegerType(), False),
        ("Employee_ID", IntegerType(), False),
        ("Street_ID", LongType(), False),
        ("Order_Date", StringType(), False),
        ("Delivery_Date", StringType(), False),
        ("Order_ID", LongType(), True),
        ("Order_Type", ByteType(), False),
        ("Product_ID", LongType(), False),
        ("Quantity", ByteType(), False),
        ("Total_Retail_Price", StringType(), False),
        ("CostPrice_Per_Unit", StringType(), False),
        ("Discount", LongType(), False),
    )
])