## Configuration Parameters

In [None]:
# Number of transformers to simulate (one random ID is generated for each)
NUM_TRANSFORMERS = 10

# Number of readings generated per batch (used by the streaming loop and the JSON export)
RECORDS_PER_BATCH = 50

# Pause between consecutive batches of the continuous stream, in seconds
STREAM_INTERVAL = 5

## Import Required Libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import random
import string
import json
from datetime import datetime, timezone
import time

## Initialize Spark Session

In [None]:
# Obtain the Spark session for this notebook (getOrCreate reuses an active
# session when there is one), registered under a recognisable app name.
spark = (
    SparkSession.builder
    .appName("TransformerStreamGenerator")
    .getOrCreate()
)

## Helper Functions

In [None]:
def generate_transformer_id():
    """Return a random transformer ID: 8 characters drawn from A-Z and 0-9."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=8)
    return ''.join(picked)

def generate_transformer_ids(num_transformers):
    """Generate a list of unique transformer IDs.

    Args:
        num_transformers: How many IDs to produce. Zero or a negative value
            yields an empty list.

    Returns:
        A list of ``num_transformers`` distinct IDs, in the order they were drawn.
    """
    ids = []
    seen = set()
    while len(ids) < num_transformers:
        candidate = generate_transformer_id()
        # IDs are drawn at random, so two draws can collide; redraw instead of
        # returning a duplicate, which would silently merge two transformers.
        # NOTE(review): this never terminates if more IDs are requested than
        # generate_transformer_id() can produce distinct values.
        if candidate in seen:
            continue
        seen.add(candidate)
        ids.append(candidate)
    return ids

class TemperatureSimulator:
    """Simulate temperature readings that change gradually per transformer.

    Each transformer ID has its own current temperature. Every call to
    ``get_temperature`` moves that value by a small random step and keeps the
    result inside ``[min_temp, max_temp]``.
    """

    def __init__(self, min_temp=60, max_temp=120, max_step=5):
        """Configure the allowed temperature range and step size.

        Args:
            min_temp: Lowest temperature a reading may take.
            max_temp: Highest temperature a reading may take.
            max_step: Largest change (in either direction) between two
                consecutive readings of the same transformer.

        Raises:
            ValueError: If ``min_temp`` exceeds ``max_temp`` or ``max_step``
                is negative.
        """
        if min_temp > max_temp:
            raise ValueError("min_temp must not exceed max_temp")
        if max_step < 0:
            raise ValueError("max_step must not be negative")
        self.min_temp = min_temp
        self.max_temp = max_temp
        self.max_step = max_step
        # Latest temperature per transformer ID; filled lazily on first reading
        self.current_temps = {}

    def _initial_temperature(self):
        """Draw a starting temperature from the middle third of the range."""
        # For the default 60-120 range this is 80-100. Deriving it from the
        # bounds keeps custom ranges from starting outside their own limits.
        margin = (self.max_temp - self.min_temp) // 3
        # randint needs integer bounds, hence the int() for non-integer limits
        return random.randint(int(self.min_temp + margin), int(self.max_temp - margin))

    def get_temperature(self, transformer_id):
        """Return the next temperature reading for ``transformer_id``.

        The first call for an ID seeds its temperature; every call (including
        the first) then applies a random step of at most ``max_step`` and
        clamps the result to the configured range. The new value is stored as
        the transformer's current temperature.
        """
        if transformer_id not in self.current_temps:
            self.current_temps[transformer_id] = self._initial_temperature()

        step = random.randint(-self.max_step, self.max_step)
        new_temp = self.current_temps[transformer_id] + step

        # Clamp with explicit comparisons rather than min()/max(): the
        # notebook's `from pyspark.sql.functions import *` shadows those builtins.
        if new_temp < self.min_temp:
            new_temp = self.min_temp
        elif new_temp > self.max_temp:
            new_temp = self.max_temp

        self.current_temps[transformer_id] = new_temp

        return new_temp

## Initialize Transformers and Temperature Simulator

In [None]:
# Generate transformer IDs
# Create the simulated fleet and the simulator that tracks each unit's temperature
transformer_ids = generate_transformer_ids(NUM_TRANSFORMERS)
temp_simulator = TemperatureSimulator()

# List the generated IDs so they can be checked by eye
print(f"Generated {len(transformer_ids)} transformer IDs:")
for unit_id in transformer_ids:
    print(f"  - {unit_id}")

## Generate Stream Data

In [None]:
def generate_stream_batch(transformer_ids, temp_simulator, num_records):
    """Build ``num_records`` simulated readings, each for a randomly chosen transformer.

    Every reading is a dict with the keys ``DateTime`` (current UTC time in
    ISO 8601 form), ``TransformerID``, ``Temperature`` (taken from
    ``temp_simulator.get_temperature``) and ``Voltage`` (a random integer
    from 220 to 280 inclusive).
    """
    def _next_reading():
        # Pick the transformer first so the same ID feeds both the ID and temperature fields
        chosen_id = random.choice(transformer_ids)
        return {
            "DateTime": datetime.now(timezone.utc).isoformat(),
            "TransformerID": chosen_id,
            "Temperature": temp_simulator.get_temperature(chosen_id),
            "Voltage": random.randint(220, 280),
        }

    return [_next_reading() for _ in range(num_records)]

def stream_to_json(records):
    """Serialize ``records`` to a JSON string, pretty-printed with 2-space indentation."""
    serialized = json.dumps(records, indent=2)
    return serialized

## Generate Single Batch (Test)

Generate and display a single batch of records to verify the output format.

In [None]:
# Produce ten readings and print them as JSON to check the record format
test_batch = generate_stream_batch(transformer_ids, temp_simulator, 10)
print("Sample batch of 10 records:", stream_to_json(test_batch), sep="\n")

## Create PySpark DataFrame from Batch

In [None]:
# Column layout for a batch of readings; every column is declared non-nullable
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("DateTime", StringType()),
        ("TransformerID", StringType()),
        ("Temperature", IntegerType()),
        ("Voltage", IntegerType()),
    )
])

# Load the test batch into Spark and print its rows without truncating values
df = spark.createDataFrame(test_batch, schema=schema)
df.show(truncate=False)

## Continuous Stream Generation

Generate continuous batches of transformer data. This cell runs indefinitely until it is manually stopped (interrupt the kernel), so the cells below it will not execute until then.

In [None]:
# Continuous stream generation.
# MAX_BATCHES is an optional cap: None keeps the run-until-interrupted
# behaviour; set a positive integer so the cell finishes on its own
# (e.g. under "Restart & Run All").
MAX_BATCHES = None

print("Starting continuous stream generation...")
print(f"Generating {RECORDS_PER_BATCH} records every {STREAM_INTERVAL} seconds")
print("Press 'Stop' to terminate\n")

# Number of batches actually generated so far
batch_count = 0

try:
    while MAX_BATCHES is None or batch_count < MAX_BATCHES:
        # Generate batch; count it only once generation has succeeded, so an
        # interrupt during generation does not over-report the total
        batch = generate_stream_batch(transformer_ids, temp_simulator, RECORDS_PER_BATCH)
        batch_count += 1

        # Create DataFrame
        df = spark.createDataFrame(batch, schema=schema)

        # Display summary: record count plus per-transformer averages
        print(f"\n=== Batch {batch_count} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')} ===")
        print(f"Records generated: {df.count()}")
        df.groupBy("TransformerID").agg(
            count("*").alias("RecordCount"),
            avg("Temperature").alias("AvgTemperature"),
            avg("Voltage").alias("AvgVoltage")
        ).show()

        # Show the first few raw records of the batch
        print("Sample records:")
        df.show(5, truncate=False)

        # Skip the pointless wait after the final batch of a capped run
        if MAX_BATCHES is not None and batch_count >= MAX_BATCHES:
            break

        # Wait before next batch
        time.sleep(STREAM_INTERVAL)

except KeyboardInterrupt:
    print(f"\nStream generation stopped. Total batches generated: {batch_count}")
else:
    # Only reachable when MAX_BATCHES is set and the cap was hit
    print(f"\nStream generation finished. Total batches generated: {batch_count}")

## Export Single Batch to JSON File

Generate a single batch and save it to a JSON file for testing or integration purposes.

In [None]:
# Generate batch and save to file
output_batch = generate_stream_batch(transformer_ids, temp_simulator, RECORDS_PER_BATCH)
output_filename = f"transformer_stream_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"

with open(output_filename, 'w') as f:
    json.dump(output_batch, f, indent=2)

print(f"Batch exported to: {output_filename}")
print(f"Records: {len(output_batch)}")