In [0]:
pip install pyspark faker azure-storage-file-datalake

In [0]:
# Import modules and libraries
from azure.storage.filedatalake import DataLakeServiceClient
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
from faker import Faker
from datetime import datetime
from azure.storage.blob import BlobServiceClient
import random
import os

In [0]:
# Spark session for this notebook (getOrCreate reuses an active session if there is one)
spark = (
    SparkSession.builder
    .appName("ComprehensiveDataSets")
    .getOrCreate()
)

# Faker instance used for every generated UUID, word, sentence, company and date
fake = Faker()

# Value pools that the dataset generators draw from at random
categories = [
    'Foundation',
    'Lipstick',
    'Mascara',
    'Eyeshadow',
    'Blush',
]
formulation_types = [
    'Liquid',
    'Powder',
    'Cream',
    'Gel',
    'Stick',
]
primary_ingredients = [
    'Shea Butter',
    'Hyaluronic Acid',
    'Vitamin E',
    'Collagen',
    'Aloe Vera',
]
status_options = [
    'Completed',
    'In Progress',
    'Failed',
    'Pending',
]

# ---------------------------------------------
# Generate Product Formulations Dataset
# ---------------------------------------------

# Rows of the Product Formulations dataset
product_data = []
# ProductIDs collected here are sampled by the batch, feedback and sales generators
product_ids = []

# Generate 20 unique product formulations
for _ in range(20):
    product_id = fake.uuid4()  # Unique identifier for each product
    product_ids.append(product_id)

    # Draw the category exactly once so ProductName and Category always agree.
    # Drawing it separately for each field produced contradictory rows, e.g. a
    # product named "... Lipstick" whose Category column said "Mascara".
    category = random.choice(categories)

    product = {
        "ProductID": product_id,
        "ProductName": fake.word().capitalize() + " " + category,
        "Category": category,
        "FormulationType": random.choice(formulation_types),
        # Two distinct ingredients, joined into a single comma-separated string
        "PrimaryIngredients": ', '.join(random.sample(primary_ingredients, 2)),
        # Dates are kept as ISO-formatted strings to match the StringType column
        "LaunchDate": fake.date_between(start_date="-2y", end_date="today").strftime("%Y-%m-%d")
    }
    product_data.append(product)

# Define schema for Product Formulations (all columns nullable strings)
product_schema = StructType([
    StructField("ProductID", StringType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("FormulationType", StringType(), True),
    StructField("PrimaryIngredients", StringType(), True),
    StructField("LaunchDate", StringType(), True)
])

# Create Product Formulations DataFrame
product_formulations_df = spark.createDataFrame(product_data, product_schema)

# ---------------------------------------------
# Generate Manufacturing Batch Dataset
# ---------------------------------------------

# Build the 50 batch records in a single pass; every batch references one of
# the previously generated product IDs.
batch_data = [
    {
        "BatchID": fake.uuid4(),
        "ProductID": random.choice(product_ids),
        "BatchDate": fake.date_between(start_date="-1y", end_date="today").strftime("%Y-%m-%d"),
        "Quantity": random.randint(100, 1000),
        "Status": random.choice(status_options)
    }
    for _ in range(50)
]

# Column name / type pairs for the batch table; every column is nullable
batch_columns = [
    ("BatchID", StringType()),
    ("ProductID", StringType()),
    ("BatchDate", StringType()),
    ("Quantity", IntegerType()),
    ("Status", StringType()),
]
batch_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in batch_columns]
)

# Materialise the records as the Manufacturing Batches DataFrame
manufacturing_batch_df = spark.createDataFrame(data=batch_data, schema=batch_schema)

# ---------------------------------------------
# Generate Customer Feedback Dataset
# ---------------------------------------------

# Pool of 30 customer IDs that feedback rows (and later sales rows) draw from
customer_ids = [fake.uuid4() for _ in range(30)]


def _random_feedback():
    """Return one mock feedback row for a random product and a random customer."""
    return {
        "FeedbackID": fake.uuid4(),
        "ProductID": random.choice(product_ids),
        "CustomerID": random.choice(customer_ids),
        "Rating": random.randint(1, 5),
        "Comments": fake.sentence(),
        "FeedbackDate": fake.date_between(start_date="-1y", end_date="today").strftime("%Y-%m-%d")
    }


# 50 feedback entries
customer_feedback_data = [_random_feedback() for _ in range(50)]

# Column name / type pairs for the feedback table; every column is nullable
feedback_columns = [
    ("FeedbackID", StringType()),
    ("ProductID", StringType()),
    ("CustomerID", StringType()),
    ("Rating", IntegerType()),
    ("Comments", StringType()),
    ("FeedbackDate", StringType()),
]
customer_feedback_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in feedback_columns]
)

# Materialise the rows as the Customer Feedback DataFrame
customer_feedback_df = spark.createDataFrame(
    data=customer_feedback_data,
    schema=customer_feedback_schema,
)

# ---------------------------------------------
# Generate Sales Dataset
# ---------------------------------------------

# Accumulator for the generated sales transactions
sales_data = []

# 50 transactions, each pairing a random customer with a random product
for _ in range(50):
    sales_data.append(
        dict(
            OrderID=fake.uuid4(),
            CustomerID=random.choice(customer_ids),
            ProductID=random.choice(product_ids),
            Quantity=random.randint(1, 10),
            TotalAmount=round(random.uniform(20, 500), 2),
            OrderDate=fake.date_between(start_date="-1y", end_date="today").strftime("%Y-%m-%d"),
        )
    )

# Column name / type pairs for the sales table; every column is nullable
sales_columns = [
    ("OrderID", StringType()),
    ("CustomerID", StringType()),
    ("ProductID", StringType()),
    ("Quantity", IntegerType()),
    ("TotalAmount", FloatType()),
    ("OrderDate", StringType()),
]
sales_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in sales_columns]
)

# Materialise the transactions as the Sales DataFrame
sales_df = spark.createDataFrame(data=sales_data, schema=sales_schema)

# ---------------------------------------------
# Generate Supplier Information Dataset
# ---------------------------------------------

# Create a list to store supplier data
supplier_data = []
supplier_ids = [fake.uuid4() for _ in range(10)]  # Generate 10 unique Supplier IDs

# Assign each SupplierID one fixed company name. Records below reuse IDs, and
# calling fake.company() per record gave the same SupplierID several different
# SupplierName values, which breaks any join or group-by on the supplier.
supplier_names = {supplier_id: fake.company() for supplier_id in supplier_ids}

# Generate 20 supplier records (a supplier may appear more than once, with
# a different material, cost and delivery date each time)
for _ in range(20):
    supplier_id = random.choice(supplier_ids)
    supplier = {
        "SupplierID": supplier_id,
        "SupplierName": supplier_names[supplier_id],
        "Material": random.choice(primary_ingredients),
        "Cost": round(random.uniform(10, 100), 2),
        # Dates are kept as ISO-formatted strings to match the StringType column
        "DeliveryDate": fake.date_between(start_date="-1y", end_date="today").strftime("%Y-%m-%d")
    }
    supplier_data.append(supplier)

# Define schema for Supplier Information (all columns nullable)
supplier_schema = StructType([
    StructField("SupplierID", StringType(), True),
    StructField("SupplierName", StringType(), True),
    StructField("Material", StringType(), True),
    StructField("Cost", FloatType(), True),
    StructField("DeliveryDate", StringType(), True)
])

# Create Supplier Information DataFrame
supplier_df = spark.createDataFrame(supplier_data, supplier_schema)

# ---------------------------------------------
# Save DataFrames to CSV Files
# ---------------------------------------------

# Root location that every CSV output is written under
output_path = "dbfs:/tmp/"

# (DataFrame, target name) pairs, written in the order listed
csv_targets = [
    (product_formulations_df, "product_formulations.csv"),
    (manufacturing_batch_df, "manufacturing_batches.csv"),
    (customer_feedback_df, "customer_feedback.csv"),
    (sales_df, "sales_data.csv"),
    (supplier_df, "supplier_information.csv"),
]

for frame, target_name in csv_targets:
    # coalesce(1) reduces the frame to one partition before the write;
    # an existing target is overwritten and a header row is included.
    frame.coalesce(1).write.csv(output_path + target_name, header=True, mode="overwrite")


In [0]:
# Azure Storage connection details (authenticated with a SAS token)
account_name = "cdmo"
container_name = "00-landing"

# SECURITY: a SAS token is a credential and must never be stored in the
# notebook source. It is read from the ADLS_SAS_TOKEN environment variable
# (on Databricks this can be supplied, for example, through a cluster
# environment variable backed by a secret scope).
# NOTE(review): the token that used to be hardcoded on this line has been
# exposed in the notebook history and should be revoked.
# A leading "?" (as copied from the Azure portal) is stripped because the URL
# below adds its own.
sas_token = os.environ.get("ADLS_SAS_TOKEN", "").lstrip("?")
if not sas_token:
    # Fail here with a clear message instead of letting every later request
    # fail with an opaque authentication error.
    raise RuntimeError(
        "ADLS_SAS_TOKEN is not set; provide a valid SAS token before running this cell."
    )

# Construct the BlobServiceClient URL
blob_service_url = f"https://{account_name}.blob.core.windows.net?{sas_token}"

# Function to upload a file to Azure Data Lake Storage (ADLS)
def upload_to_adls(local_path, blob_path):
    """
    Uploads a file to Azure Data Lake Storage using SAS Token authentication.

    The target account and container come from the module-level
    ``blob_service_url`` and ``container_name`` values. An existing blob at
    ``blob_path`` is overwritten.

    Args:
        local_path (str): Local file path to upload.
        blob_path (str): Path in the ADLS container to upload the file to.

    Raises:
        Exception: Any error raised while creating the client, opening the
            local file or uploading it is printed and then re-raised, so the
            caller cannot mistake a failed upload for a successful one.
    """
    try:
        # Initialize BlobServiceClient using the SAS token URL
        blob_service_client = BlobServiceClient(account_url=blob_service_url)

        # Get the BlobClient for the target file in the specified container
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path)

        # Upload the file to ADLS
        with open(local_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)
    except Exception as e:
        print(f"❌ Error uploading file to ADLS: {e}")
        # Propagate the failure: swallowing it here let the calling loop delete
        # the temporary files and report success for an upload that never happened.
        raise

    print(f"✅ Uploaded file to ADLS: {blob_path}")

# Dataset name -> DataFrame. The name is used for both the temporary DBFS
# directory and the ADLS folder/file name of each dataset.
datasets = dict(
    customerfeedback=customer_feedback_df,
    manufacturebatch=manufacturing_batch_df,
    productformula=product_formulations_df,
    sales=sales_df,
    supplier=supplier_df,
)

# Get the current date and time for file versioning.
# Both parts are derived from ONE timestamp: two separate datetime.now() calls
# could fall on either side of midnight and pair one day's date with the next
# day's time.
run_timestamp = datetime.now()
current_date = run_timestamp.strftime("%Y%m%d")
current_time = run_timestamp.strftime("%H%M%S")

# Save DataFrames to DBFS and upload to ADLS.
# Each dataset is handled independently: a failure is reported and the loop
# moves on to the next dataset.
for dataset_name, dataframe in datasets.items():
    # Temporary locations for this dataset
    dbfs_temp_dir = f"dbfs:/tmp/{dataset_name}"  # Temporary DBFS directory for Spark output
    local_temp_dir = f"/dbfs/tmp/{dataset_name}"  # Local path to access DBFS files
    try:
        os.makedirs(local_temp_dir, exist_ok=True)

        # Save the DataFrame to DBFS as a single CSV file
        dataframe.coalesce(1).write.csv(dbfs_temp_dir, header=True, mode="overwrite")

        # Locate the part file in the DBFS directory
        files = dbutils.fs.ls(dbfs_temp_dir)
        part_file = next((f.path for f in files if f.name.startswith("part-")), None)
        if part_file is None:
            # A bare next() would raise StopIteration, whose message is empty
            # and therefore useless in the error line printed below.
            raise FileNotFoundError(f"No part file found in {dbfs_temp_dir}")

        # Copy the part file to a meaningful name with date and time
        local_file_path = os.path.join(local_temp_dir, f"{dataset_name}_{current_date}_{current_time}.csv")
        dbutils.fs.cp(part_file, f"file:{local_file_path}")

        # Blob path in ADLS: one folder per dataset, timestamped file name
        adls_blob_path = f"data/incoming/{dataset_name}/{dataset_name}_{current_date}_{current_time}.csv"

        # Upload the file to ADLS
        upload_to_adls(local_file_path, adls_blob_path)

        print(f"✅ Dataset '{dataset_name}' successfully uploaded to ADLS as {adls_blob_path}.")
    except Exception as e:
        print(f"❌ Error processing dataset '{dataset_name}': {e}")
    finally:
        # Clean up the temporary DBFS directory on success AND on failure, so a
        # failed run does not leave stale files behind.
        try:
            dbutils.fs.rm(dbfs_temp_dir, recurse=True)
        except Exception as cleanup_error:
            print(f"⚠️ Could not remove temporary directory '{dbfs_temp_dir}': {cleanup_error}")


✅ Uploaded file to ADLS: data/incoming/customerfeedback/customerfeedback_20250129_205255.csv
✅ Dataset 'customerfeedback' successfully uploaded to ADLS as data/incoming/customerfeedback/customerfeedback_20250129_205255.csv.
✅ Uploaded file to ADLS: data/incoming/manufacturebatch/manufacturebatch_20250129_205255.csv
✅ Dataset 'manufacturebatch' successfully uploaded to ADLS as data/incoming/manufacturebatch/manufacturebatch_20250129_205255.csv.
✅ Uploaded file to ADLS: data/incoming/productformula/productformula_20250129_205255.csv
✅ Dataset 'productformula' successfully uploaded to ADLS as data/incoming/productformula/productformula_20250129_205255.csv.
✅ Uploaded file to ADLS: data/incoming/sales/sales_20250129_205255.csv
✅ Dataset 'sales' successfully uploaded to ADLS as data/incoming/sales/sales_20250129_205255.csv.
✅ Uploaded file to ADLS: data/incoming/supplier/supplier_20250129_205255.csv
✅ Dataset 'supplier' successfully uploaded to ADLS as data/incoming/supplier/supplier_20250129_205255.csv.

In [0]:
# Validate files in landing zone were created
def get_adls_service_client(account_name, sas_token):
    """Build a DataLakeServiceClient for the account's DFS endpoint.

    Args:
        account_name (str): Storage account name used to form the endpoint URL.
        sas_token (str): SAS token passed to the client as its credential.

    Returns:
        DataLakeServiceClient: Client bound to
        ``https://<account_name>.dfs.core.windows.net``.
    """
    dfs_endpoint = "https://{}.dfs.core.windows.net".format(account_name)
    client = DataLakeServiceClient(dfs_endpoint, credential=sas_token)
    return client

def list_files_in_container(service_client, container_name):
    """Print and return the names of all non-directory paths in an ADLS container.

    Args:
        service_client: Client object exposing ``get_file_system_client``.
        container_name (str): Name of the container (file system) to inspect.

    Returns:
        list: Names of every listed path whose ``is_directory`` flag is falsy.
    """
    fs_client = service_client.get_file_system_client(file_system=container_name)

    print(f"Listing files in container: {container_name}")

    # Gather every file name before printing any of them.
    file_names = []
    for entry in fs_client.get_paths():
        if entry.is_directory:
            continue
        file_names.append(entry.name)

    if not file_names:
        print("No files found.")
        return file_names

    for file_name in file_names:
        print(f"- {file_name}")
    return file_names

# Connect and list files
try:
    adls_client = get_adls_service_client(account_name, sas_token)
    # Use the configured container_name instead of repeating the literal, so
    # the validation always inspects the container the upload step wrote to.
    list_files_in_container(adls_client, container_name)
except Exception as e:
    # Validation is informational only: report the problem without raising.
    print(f"Error: {e}")



Listing files in container: 00-landing
- data/incoming/customerfeedback/customerfeedback_20250129_205255.csv
- data/incoming/manufacturebatch/manufacturebatch_20250129_205255.csv
- data/incoming/productformula/productformula_20250129_205255.csv
- data/incoming/sales/sales_20250129_205255.csv
- data/incoming/supplier/supplier_20250129_205255.csv
