In [2]:
pip install river


Collecting river
  Obtaining dependency information for river from https://files.pythonhosted.org/packages/78/f8/1ba78654c91c342a33f774747e4f9d1743447033c7ca88c5ecd87e63f678/river-0.21.0-cp311-cp311-win_amd64.whl.metadata
  Downloading river-0.21.0-cp311-cp311-win_amd64.whl.metadata (9.0 kB)
Collecting pandas<3.0,>=2.1 (from river)
  Obtaining dependency information for pandas<3.0,>=2.1 from https://files.pythonhosted.org/packages/11/17/fb1a34f3e73debbc2fd15a01ea17eaab3717943d08463ff4979a4f024b3f/pandas-2.1.4-cp311-cp311-win_amd64.whl.metadata
  Downloading pandas-2.1.4-cp311-cp311-win_amd64.whl.metadata (18 kB)
Downloading river-0.21.0-cp311-cp311-win_amd64.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB ? eta -:--:--
   - -------------------------------------- 

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from river.drift import KSWIN

In [4]:
# Load dataset
def load_dataset(file_path):
    return pd.read_csv(file_path)

In [5]:
# Preprocess dataset
def preprocess_data(data, columns_to_scale):
    data = data.fillna(method='ffill')  # Fill missing values
    scaler = StandardScaler()
    data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])
    return data

In [12]:
# Detect change using KSWIN from the river package
def detect_change_with_kswin(data, columns_to_monitor):
    results = {}
    for column in columns_to_monitor:
        kswin = KSWIN(alpha=0.001)  # Adjust alpha as needed
        results[column] = []

        for i, value in enumerate(data[column]):
            in_drift = kswin.update(value)  # Update the method and get drift status
            
            if in_drift:
                results[column].append(i)

    return results


In [13]:
# Main function
def main():
    file_path = '../../data_loading/hai-23_05/hai-test1.csv'  # Replace with your dataset path
    data = load_dataset(file_path)

    # Select columns to preprocess and monitor
    columns_to_scale_and_monitor = ['P1_FCV01D', 'P1_PIT01', 'P2_VIBTR01', 'x1001_05_SETPOINT_OUT']

    preprocessed_data = preprocess_data(data, columns_to_scale_and_monitor)
    change_points = detect_change_with_kswin(preprocessed_data, columns_to_scale_and_monitor)

    # Display the results
    for column, points in change_points.items():
        if points:
            print(f"Changes detected in {column} at indices: {points}")
        else:
            print(f"No significant changes detected in {column}.")

if __name__ == "__main__":
    main()

No significant changes detected in P1_FCV01D.
No significant changes detected in P1_PIT01.
No significant changes detected in P2_VIBTR01.
No significant changes detected in x1001_05_SETPOINT_OUT.


In [None]:
# END ALBI NEW TEST

In [38]:
# Configure Kafka consumer
kafka_bootstrap_servers = 'localhost:9092'
kafka_topic = 'hai-input'

In [39]:
consumer = KafkaConsumer(kafka_topic,
                          bootstrap_servers=kafka_bootstrap_servers, 
                          auto_offset_reset='earliest', 
                          group_id='kswin-group',
                          enable_auto_commit=True,
                          value_deserializer=lambda x: loads(x.decode('utf-8')))


In [40]:
# Initialize KSWIN detector
window_size = 100
alpha = 0.01
data_window = []

In [41]:
# Function to process each batch of streaming data
def process_batch(batch_df, epoch_id):
    global data_window
    data = batch_df.select("value").rdd.map(lambda row: float(row.value)).collect()

    # Append the data to the global window
    data_window += data

    # Keep the window size constant
    if len(data_window) > window_size:
        data_window = data_window[-window_size:]

    # Perform KSWIN test
    ks_statistic, _ = ks_2samp(data_window, np.random.normal(size=window_size))

    # Check for change
    if ks_statistic > alpha:
        print(f"Change detected in batch {epoch_id}!")

In [44]:
# Read streaming data from Kafka
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", ['localhost:9092']) \
    .option("subscribe", kafka_topic) \
    .load()

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.