# Use Case: Reprocess and Backfill Data with new ETL logic

## Import Python packages

In [None]:
import datetime
import lakefs_client
import os

## Working with the lakeFS Python client API

In [None]:
%xmode Minimal
from lakefs_client import models
from lakefs_client.client import LakeFSClient

# Build the lakeFS client configuration in one step: the API endpoint plus the
# access-key / secret-key pair used as the username / password.
# lakefsEndPoint, lakefsAccessKey and lakefsSecretKey are not defined in this
# cell and must already exist in the notebook session.
configuration = lakefs_client.Configuration(
    host=lakefsEndPoint,
    username=lakefsAccessKey,
    password=lakefsSecretKey,
)

# Client object used by the following cells for lakeFS API calls.
client = LakeFSClient(configuration)

## Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the client configured above by requesting the lakeFS server
# version. The returned value is discarded; only the success of the call matters.
# NOTE(review): presumably a bad endpoint or bad credentials make the call
# raise, so the "verified" message is never printed — confirm against the
# lakeFS client documentation.
print("Verifying lakeFS credentials")
client.config.get_lake_fs_version()
print("lakeFS credentials verified")

## S3A Gateway configuration

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the active SparkContext if there is one (otherwise create it) and wrap
# it in a SparkSession for DataFrame work.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Hadoop S3A options, applied in order: lakeFS access key, secret key and
# endpoint, followed by path-style access.
# lakefsAccessKey, lakefsSecretKey and lakefsEndPoint must already be defined
# in the notebook session.
for _s3a_option, _s3a_value in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    sc._jsc.hadoopConfiguration().set(_s3a_option, _s3a_value)

## Define data file schema

In [None]:
from pyspark.sql.types import DoubleType, StructType, StructField

# Schema of the input data file: one non-nullable double column per listed
# sales category, in this exact order.
dataFileSchema = StructType([
    StructField(column_name, DoubleType(), False)
    for column_name in (
        "Apparel_Sales",
        "Books_Sales",
        "Electronics_Sales",
        "Furniture_Sales",
        "Toys_Sales",
    )
])

In [None]:
# Schema of the processed data file: the same five per-category columns as the
# input schema plus two further columns, all non-nullable doubles.
processedDataFileSchema = StructType([
    StructField(column_name, DoubleType(), False)
    for column_name in (
        # Per-category sales columns.
        "Apparel_Sales",
        "Books_Sales",
        "Electronics_Sales",
        "Furniture_Sales",
        "Toys_Sales",
        # Additional columns (presumably computed by the ETL step — confirm).
        "Total_Sales",
        "Average_Sales_per_Product_Category",
    )
])