In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Reshape
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

def load_data(file_path):
    return pd.read_csv(file_path)

def preprocess_data(df):
    le = LabelEncoder()
    df['event_type_encoded'] = le.fit_transform(df['event_type'])
    df['agent_id_encoded'] = le.fit_transform(df['agent_id'])
    df['context_encoded'] = le.fit_transform(df['context'])
    
    vocabularies = {
        'event_type': df['event_type'].unique().tolist(),
        'agent_id': df['agent_id'].unique().tolist(),
        'context': df['context'].unique().tolist()
    }
    
    return df, vocabularies

def create_cnn_model(input_shape):
    inputs = Input(shape=input_shape)
    
    # Reshape input to add a channel dimension
    x = Reshape((input_shape[0], 1))(inputs)
    
    x = Conv1D(64, 2, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv1D(64, 2, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(2, padding='same')(x)
    x = Dropout(0.25)(x)
    
    x = Conv1D(128, 2, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv1D(128, 2, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(2, padding='same')(x)
    x = Dropout(0.25)(x)
    
    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    
    return Model(inputs=inputs, outputs=x)

class HierarchicalModel:
    def __init__(self, input_shape, vocabularies, epochs=10):
        self.cnn = create_cnn_model(input_shape)
        self.vocabularies = vocabularies
        self.epochs = epochs
        
        self.event_tree = DecisionTreeClassifier(random_state=42)
        self.agent_tree = DecisionTreeClassifier(random_state=42)
        self.context_tree = DecisionTreeClassifier(random_state=42)
    
    def fit(self, X, y):
        # Compile and train CNN
        self.cnn.compile(optimizer=Adam(), loss='mse')
        self.cnn.fit(X, np.zeros((X.shape[0], 256)), epochs=self.epochs, verbose=1)  # dummy target with correct dimension
        
        cnn_features = self.cnn.predict(X)
        
        # Train event type tree
        self.event_tree.fit(cnn_features, y['event_type'])
        
        # Train agent ID tree
        event_pred = self.event_tree.predict_proba(cnn_features)
        agent_input = np.concatenate([cnn_features, event_pred], axis=1)
        self.agent_tree.fit(agent_input, y['agent_id'])
        
        # Train context tree
        agent_pred = self.agent_tree.predict_proba(agent_input)
        context_input = np.concatenate([cnn_features, event_pred, agent_pred], axis=1)
        self.context_tree.fit(context_input, y['context'])
    
    def predict_sequence(self, initial_input, sequence_length=5):
        predictions = []
        current_input = initial_input
        
        for step in range(sequence_length):
            print(f"Step {step+1} input: {current_input}")
            
            cnn_features = self.cnn.predict(current_input)
            print(f"Features: {cnn_features}")
            
            event_pred = self.event_tree.predict_proba(cnn_features)
            event_class = self.event_tree.predict(cnn_features)[0]
            
            agent_input = np.concatenate([cnn_features, event_pred], axis=1)
            agent_pred = self.agent_tree.predict_proba(agent_input)
            agent_class = self.agent_tree.predict(agent_input)[0]
            
            context_input = np.concatenate([cnn_features, event_pred, agent_pred], axis=1)
            context_class = self.context_tree.predict(context_input)[0]
            
            pred = {
                'event_type': self.vocabularies['event_type'][event_class],
                'agent_id': self.vocabularies['agent_id'][agent_class],
                'context': self.vocabularies['context'][context_class]
            }
            predictions.append(pred)
            
            # Update the input for the next prediction
            current_input = self.update_input(current_input, [self.vocabularies['event_type'][event_class], self.vocabularies['agent_id'][agent_class], self.vocabularies['context'][context_class]])
        
        return predictions
    
    def update_input(self, current_input, prediction):
        # Create a new input reflecting the updated state based on predictions
        new_input = np.zeros_like(current_input)
        new_input[0, :-3] = current_input[0, :-3]
        new_input[0, -3:] = [
            self.vocabularies['event_type'].index(prediction[0]),
            self.vocabularies['agent_id'].index(prediction[1]),
            self.vocabularies['context'].index(prediction[2])
        ]
        return new_input

def evaluate_predictions(predictions, y_true):
    if not predictions:
        return 0.0, 0.0, 0.0
    
    predicted_events = [p['event_type'] for p in predictions]
    predicted_agents = [p['agent_id'] for p in predictions]
    predicted_contexts = [p['context'] for p in predictions]
    
    true_events = [y_true['event_type'][0]] * len(predictions)
    true_agents = [y_true['agent_id'][0]] * len(predictions)
    true_contexts = [y_true['context'][0]] * len(predictions)
    
    event_accuracy = accuracy_score(true_events, predicted_events)
    agent_accuracy = accuracy_score(true_agents, predicted_agents)
    context_accuracy = accuracy_score(true_contexts, predicted_contexts)
    
    return event_accuracy, agent_accuracy, context_accuracy

def process_event_log(file_path, epochs=10):
    # Load and preprocess data
    df = load_data(file_path)
    processed_df, vocabularies = preprocess_data(df)
    
    # Prepare input and target variables
    X = processed_df[['event_type_encoded', 'agent_id_encoded', 'context_encoded']].values
    y = processed_df[['event_type_encoded', 'agent_id_encoded', 'context_encoded']].values
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create y dictionaries for train and test sets
    y_train_dict = {
        'event_type': y_train[:, 0],
        'agent_id': y_train[:, 1],
        'context': y_train[:, 2]
    }
    y_test_dict = {
        'event_type': y_test[:, 0],
        'agent_id': y_test[:, 1],
        'context': y_test[:, 2]
    }
    
    # Create and train hierarchical model
    model = HierarchicalModel((3,), vocabularies, epochs=epochs)
    model.fit(X_train, y_train_dict)
    
    # Make predictions
    initial_input = X_test[0:1]
    predictions = model.predict_sequence(initial_input, sequence_length=5)
    
    # Evaluate predictions
    event_accuracy, agent_accuracy, context_accuracy = evaluate_predictions(predictions, y_test_dict)
    
    return predictions, X_test, y_test_dict, vocabularies, event_accuracy, agent_accuracy, context_accuracy, initial_input

# Run the pipeline
file_path = '10k_single_agent.csv'
epochs = 10
predictions, X_test, y_test, vocabularies, event_accuracy, agent_accuracy, context_accuracy, initial_input = process_event_log(file_path, epochs=epochs)

# Print the initial input
print("Initial input sequence:")
print(initial_input)

# Print the predictions
print("\nPredicted sequence:")
for i, pred in enumerate(predictions):
    print(f"Step {i+1}: {pred}")

# Print evaluation metrics
print("\nEvaluation Metrics:")
print(f"Event Type Accuracy: {event_accuracy:.2f}")
print(f"Agent ID Accuracy: {agent_accuracy:.2f}")
print(f"Context Accuracy: {context_accuracy:.2f}")


Epoch 1/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.7119
Epoch 2/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 2.5009e-05
Epoch 3/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 2.2617e-05
Epoch 4/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 4.8674e-05
Epoch 5/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 8.5418e-06
Epoch 6/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0000e+00
Epoch 7/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 1.0434e-09
Epoch 8/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 5.2173e-08
Epoch 9/10
[1m2407/2407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 3.1951e-10
Epoch 10/10
[1m2407/2407[0m [32m━━━━━━