In [0]:
pip install pyspark faker azure-storage-file-datalake

In [0]:
# Import modules and libraries
from azure.storage.filedatalake import DataLakeServiceClient
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
from pyspark.sql.functions import col, lit, expr, rand, when
from faker import Faker
from datetime import datetime
from azure.storage.blob import BlobServiceClient
import random
import os

In [0]:
# Obtain (or reuse) the Spark session that backs every DataFrame in this notebook
spark = (
    SparkSession.builder
    .appName("ComprehensiveDataSets")
    .getOrCreate()
)

# Reference value lists for the synthetic data.
# NOTE(review): the expr() SQL strings further down repeat these values as
# literals instead of reading these lists, so the two must be kept in sync by hand.
categories = [
    'Foundation',
    'Lipstick',
    'Mascara',
    'Eyeshadow',
    'Blush',
]
formulation_types = [
    'Liquid',
    'Powder',
    'Cream',
    'Gel',
    'Stick',
]
primary_ingredients = [
    'Shea Butter',
    'Hyaluronic Acid',
    'Vitamin E',
    'Collagen',
    'Aloe Vera',
]
status_options = [
    'Completed',
    'In Progress',
    'Failed',
    'Pending',
]

# ---------------------------------------------
# Generate Product Formulations Dataset (20 rows)
# ---------------------------------------------

# Build 20 synthetic product rows.
# Category is drawn once per row and reused inside ProductName so that a
# product's name and its category always agree (drawing a separate random
# category for each column lets them contradict each other).
product_formulations_df = (
    spark.range(20)
    .withColumn("ProductID", expr("uuid()"))
    # rand() is in [0, 1), so cast(rand()*5+1 as int) is a 1-based index in 1..5
    .withColumn("Category", expr("element_at(array('Foundation', 'Lipstick', 'Mascara', 'Eyeshadow', 'Blush'), (cast(rand()*5+1 as int)))"))
    # Prefix is the third character of rand() rendered as text (typically its
    # first decimal digit), giving names such as "7 Lipstick"
    .withColumn("ProductName", expr("concat(upper(substring(rand(), 3, 1)), ' ', Category)"))
    .withColumn("FormulationType", expr("element_at(array('Liquid', 'Powder', 'Cream', 'Gel', 'Stick'), (cast(rand()*5+1 as int)))"))
    # Two independent draws, so the same ingredient can appear twice
    .withColumn("PrimaryIngredients", expr("concat(element_at(array('Shea Butter', 'Hyaluronic Acid', 'Vitamin E', 'Collagen', 'Aloe Vera'), (cast(rand()*5+1 as int))), ', ', element_at(array('Shea Butter', 'Hyaluronic Acid', 'Vitamin E', 'Collagen', 'Aloe Vera'), (cast(rand()*5+1 as int))))"))
    .withColumn("LaunchDate", expr("date_sub(current_date(), cast(rand()*730 as int))"))  # Random date within the last 2 years
    # Keep the column order the CSV output has always had
    .select("id", "ProductID", "ProductName", "Category", "FormulationType", "PrimaryIngredients", "LaunchDate")
)

# ---------------------------------------------
# Generate Manufacturing Batch Dataset (50 rows)
# ---------------------------------------------

# Build 50 synthetic manufacturing-batch rows in a single projection.
# NOTE(review): ProductID is a fresh uuid() per row; it is not drawn from
# product_formulations_df, so these rows do not join to the product dataset.
manufacturing_batch_df = spark.range(50).select(
    "id",
    expr("uuid()").alias("BatchID"),
    expr("uuid()").alias("ProductID"),
    # Random date within the last year
    expr("date_sub(current_date(), cast(rand()*365 as int))").alias("BatchDate"),
    # rand() is in [0, 1), so the truncated value lies in 100..999
    (rand() * 900 + 100).cast("int").alias("Quantity"),
    expr("element_at(array('Completed', 'In Progress', 'Failed', 'Pending'), (cast(rand()*4+1 as int)))").alias("Status"),
)

# ---------------------------------------------
# Generate Customer Feedback Dataset (50 rows)
# ---------------------------------------------

# Build 50 synthetic customer-feedback rows.
# NOTE(review): ProductID and CustomerID are fresh uuid() values per row and
# therefore do not match IDs in the product or sales datasets.
customer_feedback_df = (
    spark.range(50)
    .withColumn("FeedbackID", expr("uuid()"))
    .withColumn("ProductID", expr("uuid()"))
    .withColumn("CustomerID", expr("uuid()"))
    # rand() is in [0, 1): rand()*5+1 truncates to 1..5, so the top rating of 5
    # is reachable (rand()*4+1 could only ever yield 1..4)
    .withColumn("Rating", (rand() * 5 + 1).cast("int"))
    # Build the text with SQL concat(); using Python "+" between a str and a
    # Column produces an arithmetic addition, which evaluates to NULL (or a
    # cast error under ANSI mode) rather than a string
    .withColumn("Comments", expr("concat('Feedback ', cast(cast(rand()*10000 as int) as string))"))
    # Random date within the last year
    .withColumn("FeedbackDate", expr("date_sub(current_date(), cast(rand()*365 as int))"))
)

# ---------------------------------------------
# Generate Sales Dataset (50 rows)
# ---------------------------------------------

# Build 50 synthetic sales-order rows in a single projection.
# NOTE(review): CustomerID and ProductID are fresh uuid() values per row, and
# TotalAmount is drawn independently of Quantity.
sales_df = spark.range(50).select(
    "id",
    expr("uuid()").alias("OrderID"),
    expr("uuid()").alias("CustomerID"),
    expr("uuid()").alias("ProductID"),
    # rand() is in [0, 1), so the truncated value lies in 1..9
    (rand() * 9 + 1).cast("int").alias("Quantity"),
    # Uniform amount in [20, 500)
    (rand() * 480 + 20).cast("float").alias("TotalAmount"),
    # Random date within the last year
    expr("date_sub(current_date(), cast(rand()*365 as int))").alias("OrderDate"),
)

# ---------------------------------------------
# Generate Supplier Information Dataset (20 rows)
# ---------------------------------------------

# Build 20 synthetic supplier rows.
supplier_df = (
    spark.range(20)
    .withColumn("SupplierID", expr("uuid()"))
    # Build the name with SQL concat(); using Python "+" between a str and a
    # Column produces an arithmetic addition, which evaluates to NULL (or a
    # cast error under ANSI mode) rather than "Supplier_<n>"
    .withColumn("SupplierName", expr("concat('Supplier_', cast(cast(rand()*1000 as int) as string))"))
    # rand() is in [0, 1), so cast(rand()*5+1 as int) is a 1-based index in 1..5
    .withColumn("Material", expr("element_at(array('Shea Butter', 'Hyaluronic Acid', 'Vitamin E', 'Collagen', 'Aloe Vera'), (cast(rand()*5+1 as int)))"))
    # Uniform cost in [10, 100)
    .withColumn("Cost", (rand() * 90 + 10).cast("float"))
    # Random date within the last year
    .withColumn("DeliveryDate", expr("date_sub(current_date(), cast(rand()*365 as int))"))
)

# ---------------------------------------------
# Save DataFrames to CSV Files in Databricks
# ---------------------------------------------

output_path = "dbfs:/tmp/"

# Output name -> DataFrame, listed in the order they are written.
# Spark writes each target as a directory containing a single part file
# (coalesce(1)), even though the names end in ".csv".
# NOTE(review): the DataFrames use rand()/uuid(), so every write re-evaluates
# them and produces different values from any other write of the same frame.
csv_exports = [
    ("product_formulations.csv", product_formulations_df),
    ("manufacturing_batches.csv", manufacturing_batch_df),
    ("customer_feedback.csv", customer_feedback_df),
    ("sales_data.csv", sales_df),
    ("supplier_information.csv", supplier_df),
]

for csv_name, frame in csv_exports:
    frame.coalesce(1).write.mode("overwrite").csv(output_path + csv_name, header=True)


In [0]:
import os
from datetime import datetime
from azure.storage.blob import BlobServiceClient

# ----------------------------------------------------------------------------------
# 1. AZURE STORAGE CONFIGURATION (USING SAS TOKEN)
# ----------------------------------------------------------------------------------

# Azure Storage connection details
account_name = "cdmo"
container_name = "00-landing"

# The SAS token is a credential and must not be stored in the notebook.
# It is read from the AZURE_STORAGE_SAS_TOKEN environment variable instead
# (for example a cluster environment variable backed by a secret store).
# A leading "?" (as copied from the Azure portal) is tolerated and removed.
sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN", "").strip().lstrip("?")
if not sas_token:
    raise RuntimeError(
        "AZURE_STORAGE_SAS_TOKEN is not set; provide a valid SAS token through "
        "the environment before running this cell."
    )

# Construct the BlobServiceClient URL (the SAS token is carried as the query string)
blob_service_url = f"https://{account_name}.blob.core.windows.net?{sas_token}"

# Initialize BlobServiceClient
blob_service_client = BlobServiceClient(account_url=blob_service_url)

# ----------------------------------------------------------------------------------
# 2. FUNCTION TO UPLOAD A FILE TO ADLS
# ----------------------------------------------------------------------------------

def upload_to_adls(local_path, blob_path, service_client=None, container=None):
    """
    Uploads a local file to a blob in Azure storage, overwriting any existing blob.

    Args:
        local_path (str): Local file path to upload.
        blob_path (str): Path inside the container to upload the file to.
        service_client: Optional client exposing ``get_blob_client(container=, blob=)``.
            Defaults to the module-level ``blob_service_client``.
        container (str): Optional container name. Defaults to the module-level
            ``container_name``.

    Raises:
        Exception: Any error raised while opening the file or uploading it is
            reported on stdout and then re-raised, so callers never mistake a
            failed upload for a successful one.
    """
    # Resolve the defaults at call time so later re-assignment of the
    # notebook-level client/container is picked up.
    if service_client is None:
        service_client = blob_service_client
    if container is None:
        container = container_name

    try:
        # Get the BlobClient for the target file in the specified container
        blob_client = service_client.get_blob_client(container=container, blob=blob_path)

        # Stream the file contents to the blob
        with open(local_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)
    except Exception as e:
        print(f"❌ Error uploading file to ADLS: {e}")
        raise

    print(f"✅ Uploaded file to ADLS: {blob_path}")

# ----------------------------------------------------------------------------------
# 3. DATASETS & FILE PROCESSING
# ----------------------------------------------------------------------------------

# Define datasets and their corresponding ADLS folders
datasets = {
    "customerfeedback": customer_feedback_df,
    "manufacturebatch": manufacturing_batch_df,
    "productformula": product_formulations_df,
    "sales": sales_df,
    "supplier": supplier_df,
}

# Take a single timestamp for file versioning so the date and time parts of
# every file name come from the same instant (two separate now() calls could
# straddle midnight and disagree).
run_timestamp = datetime.now()
current_date = run_timestamp.strftime("%Y%m%d")
current_time = run_timestamp.strftime("%H%M%S")

# Process each dataset and upload it to both incoming and archive paths.
# A failure in one dataset is reported and the loop moves on to the next one.
for dataset_name, dataframe in datasets.items():
    # Define temporary paths for saving and accessing data
    dbfs_temp_dir = f"dbfs:/tmp/{dataset_name}"  # Temporary DBFS directory for Spark output
    local_temp_dir = f"/dbfs/tmp/{dataset_name}"  # Local path to access DBFS files
    versioned_file_name = f"{dataset_name}_{current_date}_{current_time}.csv"

    try:
        os.makedirs(local_temp_dir, exist_ok=True)

        # Save the DataFrame to DBFS as a single CSV part file
        dataframe.coalesce(1).write.csv(dbfs_temp_dir, header=True, mode="overwrite")

        # Locate the part file in the DBFS directory; fail with a clear message
        # instead of a bare StopIteration when Spark produced none
        files = dbutils.fs.ls(dbfs_temp_dir)
        part_file = next((f.path for f in files if f.name.startswith("part-")), None)
        if part_file is None:
            raise FileNotFoundError(f"No part file found in {dbfs_temp_dir}")

        # Copy the part file to a meaningful name with date and time
        local_file_path = os.path.join(local_temp_dir, versioned_file_name)
        dbutils.fs.cp(part_file, f"file:{local_file_path}")

        # Define the blob paths in ADLS for both incoming and archive
        incoming_blob_path = f"data/incoming/{dataset_name}/{versioned_file_name}"
        archive_blob_path = f"data/archive/{dataset_name}/{versioned_file_name}"

        # Upload the file to both incoming and archive directories
        upload_to_adls(local_file_path, incoming_blob_path)
        upload_to_adls(local_file_path, archive_blob_path)

        print(f"✅ Dataset '{dataset_name}' successfully uploaded to ADLS (incoming & archive).")
    except Exception as e:
        print(f"❌ Error processing dataset '{dataset_name}': {e}")
    finally:
        # Always clean up the temporary DBFS directory, including after a
        # failure, so stale part files are not left behind
        try:
            dbutils.fs.rm(dbfs_temp_dir, recurse=True)
        except Exception as cleanup_error:
            print(f"❌ Error cleaning up temporary directory '{dbfs_temp_dir}': {cleanup_error}")


✅ Uploaded file to ADLS: data/incoming/customerfeedback/customerfeedback_20250130_044103.csv
✅ Uploaded file to ADLS: data/archive/customerfeedback/customerfeedback_20250130_044103.csv
✅ Dataset 'customerfeedback' successfully uploaded to ADLS (incoming & archive).
✅ Uploaded file to ADLS: data/incoming/manufacturebatch/manufacturebatch_20250130_044103.csv
✅ Uploaded file to ADLS: data/archive/manufacturebatch/manufacturebatch_20250130_044103.csv
✅ Dataset 'manufacturebatch' successfully uploaded to ADLS (incoming & archive).
✅ Uploaded file to ADLS: data/incoming/productformula/productformula_20250130_044103.csv
✅ Uploaded file to ADLS: data/archive/productformula/productformula_20250130_044103.csv
✅ Dataset 'productformula' successfully uploaded to ADLS (incoming & archive).
✅ Uploaded file to ADLS: data/incoming/sales/sales_20250130_044103.csv
✅ Uploaded file to ADLS: data/archive/sales/sales_20250130_044103.csv
✅ Dataset 'sales' successfully uploaded to ADLS (incoming & archive).
✅

In [0]:
# Validate files in landing zone were created
def get_adls_service_client(account_name, sas_token):
    """Create a DataLakeServiceClient for the account's DFS endpoint.

    Args:
        account_name: Storage account name used to build the endpoint URL.
        sas_token: SAS token passed to the client as its credential.

    Returns:
        The DataLakeServiceClient constructed for that endpoint.
    """
    dfs_endpoint = "https://{}.dfs.core.windows.net".format(account_name)
    adls_service = DataLakeServiceClient(dfs_endpoint, credential=sas_token)
    return adls_service

def list_files_in_container(service_client, container_name):
    """Print and return the names of all non-directory paths in a container.

    Args:
        service_client: Client exposing ``get_file_system_client(file_system=...)``.
        container_name: Name of the container (file system) to enumerate.

    Returns:
        list: Names of every path whose ``is_directory`` flag is falsy, in the
        order the service returned them.
    """
    fs_client = service_client.get_file_system_client(file_system=container_name)

    print(f"Listing files in container: {container_name}")

    # Walk the listing once, skipping directory entries
    file_names = []
    for entry in fs_client.get_paths():
        if entry.is_directory:
            continue
        file_names.append(entry.name)

    if not file_names:
        print("No files found.")
        return file_names

    for file_name in file_names:
        print(f"- {file_name}")
    return file_names

# Connect and list files in the same container the upload cell writes to.
# container_name is reused (rather than repeating the literal) so this check
# cannot drift from the upload target.
try:
    adls_client = get_adls_service_client(account_name, sas_token)
    list_files_in_container(adls_client, container_name)
except Exception as e:
    # Report the failure without aborting the notebook run
    print(f"Error: {e}")



Listing files in container: 00-landing
- data/archive/customerfeedback/customerfeedback_20250130_044103.csv
- data/archive/manufacturebatch/manufacturebatch_20250130_044103.csv
- data/archive/productformula/productformula_20250130_044103.csv
- data/archive/sales/sales_20250130_044103.csv
- data/archive/supplier/supplier_20250130_044103.csv
- data/incoming/customerfeedback/customerfeedback_20250130_044103.csv
- data/incoming/manufacturebatch/manufacturebatch_20250130_044103.csv
- data/incoming/productformula/productformula_20250130_044103.csv
- data/incoming/sales/sales_20250130_044103.csv
- data/incoming/supplier/supplier_20250130_044103.csv
