In [None]:
# Common code

import re
import pickle
import sqlite3

class LogFile:
    """Tokenize a journal log file and store the result in a SQLite database.

    Each accepted log line is stored in ``log_lines``; its normalized tokens
    go to ``tokens`` (with an occurrence counter) and the raw identifiers
    found in it (hashes, UUIDs, generated name prefixes/suffixes) go to
    ``identifiers``.  ``log_line_tokens`` and ``log_line_identifiers`` link
    lines to tokens and identifiers.

    The instance can be used as a context manager; leaving the ``with`` block
    closes the database connection.
    """

    __path = None
    # The dot between labels is escaped so that it only matches a literal '.'
    __hostname_pattern = re.compile(r'^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$')
    __id_suffix_pattern = re.compile(r'^([a-z]+-)+[a-f0-9]+-.{5}$') # something-and-else-fb6afe71-ea3fd
    __id_prefix_pattern1 = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}-(.*)$') # fb6afe71-6f53-48dd-920f-743ab83e1c2f-something
    __id_prefix_pattern2 = re.compile(r'^[a-f0-9]{32}-(.*)$') # 32-digit hex number prefix
    __id_suffix_pattern3 = re.compile(r'^([a-z]+-)+[a-f0-9]{5}$') # something-else-ea3fd (currently not used by any method)
    # Group 1 captures the WHOLE dash-separated word prefix, not just its last word
    __id_suffix_pattern4 = re.compile(r'^((?:[a-z]+-)+)[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$') # something-16258d4f-7f2d-47f2-a11a-ef10e66dfc12
    __id_token_pattern1 = re.compile(r'^[a-z-]+-') # just words separated by dashes
    __id_token_pattern2 = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$') # 16258d4f-7f2d-47f2-a11a-ef10e66dfc12
    __numeric_token_pattern = re.compile(r'^[+]?[0-9]+[.]*[0-9]*[mug]*i*[sb]*$') # +4.653ms and such
    __hex_num_pattern = re.compile(r'[a-f0-9]+') # used with fullmatch(): the whole token must be hex digits
    # Punctuation that is turned into a space, and quotes that are dropped
    __separator_chars = '{}[]()=:/\\,&?_'
    __ttable = str.maketrans(__separator_chars, ' ' * len(__separator_chars), '"\'')
    __schema = (
        "create table if not exists log_lines(ID integer primary key, line text not NULL)",
        "create table if not exists tokens(ID integer primary key, token text unique not NULL, freq integer not null default 1)",
        "create table if not exists identifiers(ID integer primary key, identifier text unique not NULL)",
        "create table if not exists log_line_tokens(line_ID integer not null, token_ID integer not null, primary key (line_ID, token_ID))",
        "create table if not exists log_line_identifiers(line_ID integer not null, identifier_ID integer not null, primary key (line_ID, identifier_ID))",
    )
    # Tables filled by parse_log(); all of them are emptied when clear=True
    __data_tables = ("log_line_tokens", "log_line_identifiers", "tokens", "identifiers", "log_lines")
    __db = None
    __dbc = None

    def __init__(self, path = None, db_path = None, clear = True):
        """Open (or create) the database and make sure the schema exists.

        Args:
            path: Path of the log file to parse; may be set later with set_path().
            db_path: SQLite database file; None keeps the data in memory.
            clear: When true, remove everything a previous parse_log() stored,
                so that re-parsing the same file does not double the token
                frequencies or leave stale line/token links behind.
        """
        self.__db = sqlite3.connect(":memory:" if db_path is None else db_path)
        self.__dbc = self.__db.cursor()
        for statement in self.__schema:
            self.__dbc.execute(statement)
        if clear:
            # Only one log file can be processed per database, but repeated runs are idempotent
            for table in self.__data_tables:
                self.__dbc.execute(f"delete from {table}")
            self.__db.commit()
        self.set_path(path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the database connection; calling it more than once is harmless."""
        if self.__db is not None:
            self.__db.close()
            self.__db = None
            self.__dbc = None

    def __fini__(self):
        # Python never calls a method with this name; kept for callers that
        # invoke it explicitly.  Prefer close() or a ``with`` block.
        self.close()

    def __is_hostname(self, s):
        """Return True for dotted names made of valid hostname labels."""
        return s.count('.') > 0 and self.__hostname_pattern.match(s) is not None

    def __is_hash(self, s):
        """Return True for 32/64-digit hex strings and for UUIDs."""
        if len(s) in (32, 64) and self.__hex_num_pattern.fullmatch(s):
            return True
        return self.__id_token_pattern2.match(s) is not None

    def __is_numberlike(self, s):
        """Return True for plain numbers and numbers with a unit, e.g. '+4.653ms'."""
        return s.isnumeric() or self.__numeric_token_pattern.match(s) is not None

    # 'some-operator-69854dc866-sbpwz' > 'some-operator-id_suffix'
    def __replace_id_suffix(self, s):
        """Replace a generated suffix with 'id_suffix'.

        Returns:
            (new_token, True) when a suffix was replaced, (s, False) otherwise.
        """
        if s.startswith("--"):
            return (s, False)
        if self.__id_suffix_pattern.match(s):
            m = self.__id_token_pattern1.match(s)
            if m is not None:
                return (m.group() + 'id_suffix', True)
        m = self.__id_suffix_pattern4.match(s)
        if m is not None:
            return (m.group(1) + 'id_suffix', True)
        return (s, False)

    def __replace_id_prefix(self, s):
        """Replace a leading UUID or 32-digit hex number with 'id-prefix-'.

        Returns:
            (new_token, True) when a prefix was replaced, (s, False) otherwise.
        """
        for pattern in (self.__id_prefix_pattern1, self.__id_prefix_pattern2):
            m = pattern.match(s)
            if m is not None:
                return ('id-prefix-' + m.group(1), True)
        return (s, False)

    def __log_line_tokenize(self, line):
        """Split one log line into normalized tokens and raw identifiers.

        Args:
            line: A single (already case-folded) log line.

        Returns:
            (tokens, identifiers): the normalized token list, starting with
            the severity letter followed by '000', and the list of distinct
            raw identifier strings found in the line.

        Raises:
            IndexError: The line has too few fields or is not a
                kubenswrapper info/warning/error line; callers use this as
                the "skip this line" signal.
        """
        fields = line.split(" ", 8)
        # Filter out things that can't be possibly kubernetes and even the k8s startup log lines.
        # Indexing a too-short line raises IndexError on its own.
        if not (fields[4].startswith("kubenswrapper") and fields[5].startswith(("i", "w", "e"))):
            raise IndexError
        log_text = fields.pop(-1).translate(self.__ttable)
        words = [w.strip(". ?!*#,+") for w in log_text.split()]
        # If the line had only six fields the pop above removed fields[5],
        # so this lookup raises IndexError and the line is skipped.
        tokens = [fields[5][0] + '000'] + words

        ret = []
        identifiers = []
        for t in tokens:
            if self.__is_numberlike(t):
                continue
            if self.__is_hostname(t):
                tr = "_hostname_token"
            elif self.__is_hash(t):
                if t not in identifiers:
                    identifiers.append(t)
                tr = "_hash_token"
            else:
                (tr, found) = self.__replace_id_prefix(t)
                if found and t not in identifiers:
                    identifiers.append(t)
                (tr, found) = self.__replace_id_suffix(tr)
                if found and t not in identifiers:
                    identifiers.append(t)
            ret.append(tr)
        return (ret, identifiers)

    def __store_line(self, line_num, line, tokens, identifiers):
        """Insert one parsed line with its token and identifier links."""
        self.__dbc.execute("insert into log_lines (ID, line) values (?, ?)", (line_num, line))
        for t in tokens:
            # Stripping punctuation can leave an empty token; never store those
            if not t.strip():
                continue
            self.__dbc.execute("insert into tokens (token) values (?) on conflict(token) do update set freq = freq + 1", (t,))
            t_id = self.__dbc.execute("select id from tokens where (token = ?)", (t,)).fetchone()[0]
            self.__dbc.execute("insert or ignore into log_line_tokens (line_ID, token_ID) values (?,?)", (line_num, t_id))
        for ident in identifiers:
            self.__dbc.execute("insert or ignore into identifiers (identifier) values (?)", (ident,))
            i_id = self.__dbc.execute("select id from identifiers where (identifier = ?)", (ident,)).fetchone()[0]
            self.__dbc.execute("insert or ignore into log_line_identifiers (line_ID, identifier_ID) values (?,?)", (line_num, i_id))

    def set_path(self, path):
        """Set the path of the log file that parse_log() reads."""
        self.__path = path

    def parse_log(self, limit = 0):
        """Parse the log file and store every accepted line in the database.

        Args:
            limit: Maximum number of physical lines to read; 0 reads the
                whole file.  Skipped lines count towards the limit.

        The 1-based physical line number becomes the line ID.  Lines that the
        tokenizer rejects are skipped silently.
        """
        with open(self.__path, "r") as logfile:
            for line_num, raw_line in enumerate(logfile, start=1):
                if limit != 0 and line_num > limit:
                    break
                line = raw_line.casefold()
                try:
                    tokens, identifiers = self.__log_line_tokenize(line)
                except IndexError:
                    # Not a line we are interested in
                    continue
                self.__store_line(line_num, line, tokens, identifiers)
        self.__db.commit()

    def export_freq_table(self, filename):
        """Write one '"token", freq' line per token to a text file."""
        with open(filename, "w") as fw:
            for r in self.__dbc.execute("select token, freq from tokens"):
                fw.write(f"\"{r[0]}\", {r[1]}\n")

    def export_words_table(self, filename):
        """Copy all tokens into the ``words`` table of another SQLite file.

        The target database is attached only for the duration of the call,
        so the method can be called repeatedly.  Existing words are kept.
        """
        self.__db.commit()
        # Bound parameter: the file name may contain quotes
        self.__dbc.execute("attach database ? as words_db", (str(filename),))
        try:
            self.__dbc.execute("create table if not exists words_db.words (id integer primary key, word text unique not null)")
            self.__dbc.execute("insert or ignore into words_db.words (word) select token from tokens")
            self.__db.commit()
        except Exception:
            # A database cannot be detached while a transaction is open
            self.__db.rollback()
            raise
        finally:
            self.__dbc.execute("detach database words_db")

    def import_words_table(self, filename):
        """Replace the ``words`` table with the lines of a text file (one word per line)."""
        self.__dbc.execute("create table if not exists words (id integer primary key, word text unique not null)")
        self.__dbc.execute("delete from words")
        with open(filename, "r") as fr:
            for line in fr:
                self.__dbc.execute("insert or ignore into words (word) values (?)",(line.rstrip(),))
        self.__db.commit()

    def get_words_count(self):
        """Return the number of rows in the ``words`` table."""
        r = self.__dbc.execute("select count(*) from words")
        return r.fetchone()[0]

    def save_db(self, filename):
        """Write a compacted copy of the database to ``filename`` (VACUUM INTO)."""
        self.__db.commit()
        # Bound parameter: the file name may contain quotes
        self.__dbc.execute("vacuum into ?", (str(filename),))

    def db_handle(self):
        """Return the underlying sqlite3 connection."""
        return self.__db

In [None]:
# Build the raw dictionary with token counts
# No db_path is passed, so LogFile keeps the data in an in-memory SQLite database;
# the commented-out variant below would persist it to 'efs-e2e.db' instead.
#logfile = LogFile("journal-aws-efs-operator-e2e.0", 'efs-e2e.db')
logfile = LogFile("journal-aws-efs-operator-e2e.0")
# Tokenize the whole file (limit defaults to 0) and fill the tables
logfile.parse_log()

In [None]:
# Inspect the stored log lines: print the total count, then the first 20 rows.
# `con` and `cur` are reused by the following inspection cells.
con = logfile.db_handle()
cur = con.cursor()
r = cur.execute("select count(ID) from log_lines")
print(r.fetchone()[0])
for r in cur.execute("select * from log_lines limit 20"):
    print(r)

In [None]:
# Print the number of rows in the tokens table, then the 20 most frequent tokens
r = cur.execute("select count(ID) from tokens")
print(r.fetchone()[0])
for r in cur.execute("select * from tokens order by freq desc limit 20"):
    print(r)

In [None]:
# Dump every token with its frequency to a text file, one '"token", freq' pair per line
logfile.export_freq_table("frequencies.csv")

In [None]:
# Print the number of identifiers, then every identifier row
r = cur.execute("select count(id) from identifiers")
print(r.fetchone()[0])
for r in cur.execute("select * from identifiers"):
    print(r)

In [None]:
# Print the number of line/identifier links, then the first 20 links ordered by line
r = cur.execute("select count(line_ID) from log_line_identifiers")
print(r.fetchone()[0])
for r in cur.execute("select * from log_line_identifiers order by line_ID limit 20"):
    print(r)

In [None]:
# Print the number of line/token links, then the first 20 links ordered by token
r = cur.execute("select count(line_ID) from log_line_tokens")
print(r.fetchone()[0])
for r in cur.execute("select * from log_line_tokens order by token_ID limit 20"):
    print(r)


In [None]:
# Copy all tokens into the `words` table of a separate SQLite file for manual review
logfile.export_words_table('words.db')
# Make sure the working database has a `words` table as well; `if not exists`
# keeps this cell re-runnable instead of failing on the second execution.
r = cur.execute("create table if not exists words (id integer primary key, word text unique not null)")

# Here: review the db / words table and export as csv to be imported as the label set

In [None]:
import torch
import torch.nn as nn

logfile = LogFile("journal-aws-efs-operator-e2e.0", 'test2.db')
logfile.import_words_table('words.csv')
logfile.parse_log()

con = logfile.db_handle()
cur = con.cursor()

words_num = logfile.get_words_count()
print(f"Dictitonary size: {words_num} words")

# Build one multi-hot vector per (identifier, log line) pair.  Rows arrive
# ordered by identifier, then line, then word, so all words of one pair are
# consecutive and a change of the pair marks the start of a new vector.
dataset_query = """
    select words.id as word_id, log_line_identifiers.identifier_ID, log_line_identifiers.line_ID
    from words
        join tokens on tokens.token = words.word
        join log_line_tokens on log_line_tokens.token_ID = tokens.ID
        join log_line_identifiers on log_line_identifiers.line_ID = log_line_tokens.line_ID
    order by log_line_identifiers.identifier_ID, log_line_identifiers.line_ID, word_id
"""
raw_dataset = []
line_words = None
prev_key = None
for (word_id, identifier_id, line_id) in cur.execute(dataset_query):
    key = (identifier_id, line_id)
    if key != prev_key:
        # Flush the finished vector before starting the next one
        if line_words is not None:
            raw_dataset.append(line_words)
        line_words = torch.zeros(words_num)
        prev_key = key
    # assumes words.id values are contiguous and start at 1 — TODO confirm
    line_words[word_id - 1] = 1.0
# The vector of the last pair is still pending when the loop ends
if line_words is not None:
    raw_dataset.append(line_words)

if raw_dataset:
    training_set = torch.stack(raw_dataset)
else:
    # torch.stack() rejects an empty list; keep a well-formed empty tensor
    training_set = torch.zeros((0, words_num))

print(f"Training set shape: {training_set.shape}")

In [None]:
class LogLSTM(nn.Module):
    """Stacked LSTM that maps a sequence of vectors to one next-vector estimate.

    The last time step of the LSTM output is projected back to ``input_dim``
    and squashed with a sigmoid, so every output element lies in (0, 1).
    """

    def __init__(self, input_dim=1171, hidden_dim=256, num_layers=2, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Inter-layer dropout is only passed on when there is more than one layer
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

        # Projection from the hidden state back to the input vector size
        self.output_layer = nn.Linear(hidden_dim, input_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the prediction for the vector following sequence ``x``.

        ``x`` is indexed as (batch, time, feature) because the LSTM is built
        with ``batch_first=True``; the result has shape (batch, input_dim).
        """
        per_step_output, _ = self.lstm(x)
        final_step = per_step_output[:, -1, :]
        return self.sigmoid(self.output_layer(final_step))

    def init_hidden(self, batch_size, device):
        """Return zero-filled (h0, c0) tensors on ``device``."""
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(state_shape).to(device)
        c0 = torch.zeros(state_shape).to(device)
        return h0, c0



In [None]:
# Prepare sequences of length 50 for training
def create_sequences(data, sequence_length=50):
    """
    Cut sliding windows of length sequence_length out of the data.

    Args:
        data: Tensor whose first dimension is the time axis.
        sequence_length: Number of consecutive vectors per input window.

    Returns:
        X (input windows) and y (the vector that follows each window).
        When data holds no complete window plus target, both tensors are
        empty but still have the right trailing dimensions.

    Raises:
        ValueError: If sequence_length is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must not be negative")

    num_windows = len(data) - sequence_length
    if num_windows <= 0:
        # torch.stack() cannot build a tensor from an empty list
        feature_shape = tuple(data.shape[1:])
        return (
            data.new_zeros((0, sequence_length) + feature_shape),
            data.new_zeros((0,) + feature_shape),
        )

    # Window i covers data[i : i + sequence_length]; its target is the next vector
    X = torch.stack([data[i:i + sequence_length] for i in range(num_windows)])
    y = data[sequence_length:].clone()
    return X, y

# Create sequences from training data
sequence_length = 50
print(f"Creating sequences of length {sequence_length} from training set...")
X, y = create_sequences(training_set, sequence_length)

print(f"Input sequences shape: {X.shape}")  # (num_sequences, sequence_length, input_dim)
print(f"Target vectors shape: {y.shape}")   # (num_sequences, input_dim)
print(f"Number of training sequences: {len(X)}")

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# The vector width follows the dictionary that was actually loaded, so the
# model keeps matching the data when the word list changes.
input_dim = training_set.shape[1]
model = LogLSTM(input_dim=input_dim, hidden_dim=256, num_layers=2, dropout=0.2)
model.to(device)

print("Model architecture:")
print(model)


In [None]:
# Training setup
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Manual train/validation split function (no sklearn needed)
def manual_train_test_split(X, y, test_size=0.2, random_seed=42):
    """
    Shuffle the samples and divide them into a train part and a test part.

    The global torch random seed is set to random_seed, so the same seed
    always yields the same split.  The first int(len(X) * test_size) shuffled
    samples form the test part.

    Returns:
        X_train, X_test, y_train, y_test
    """
    # Seeding the global generator makes the permutation reproducible
    torch.manual_seed(random_seed)

    sample_count = len(X)
    held_out_count = int(sample_count * test_size)

    shuffled = torch.randperm(sample_count)
    held_out = shuffled[:held_out_count]
    kept = shuffled[held_out_count:]

    return X[kept], X[held_out], y[kept], y[held_out]

# Split data into train and validation sets
# (80 % training / 20 % validation, reproducible through the fixed seed)
X_train, X_val, y_train, y_val = manual_train_test_split(
    X, y, test_size=0.2, random_seed=42
)

print(f"Training set: {X_train.shape[0]} sequences")
print(f"Validation set: {X_val.shape[0]} sequences")

# Create data loaders
batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# Only the training batches are reshuffled; validation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss function and optimizer
criterion = nn.BCELoss()  # Binary cross-entropy for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Multiply the learning rate by 0.1 after every 10 scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

print("Training setup complete!")


In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=20):
    """
    Train the model and track the average loss per epoch.

    Args:
        model: Module to train; batches are moved to the device of its parameters.
        train_loader: Iterable of (inputs, targets) training batches; must support len().
        val_loader: Iterable of (inputs, targets) validation batches; may be empty.
        criterion: Loss function called as criterion(outputs, targets).
        optimizer: Optimizer that updates the model parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of passes over the training data.

    Returns:
        (train_losses, val_losses): one average loss per epoch each.  An
        epoch without batches is recorded as NaN instead of raising
        ZeroDivisionError.
    """
    # Follow the model instead of relying on a module-level `device` variable
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    def average(total, batch_count):
        return total / batch_count if batch_count else float('nan')

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(run_device), batch_y.to(run_device)

            optimizer.zero_grad()
            loss = criterion(model(batch_X), batch_y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase: no gradients, no parameter updates
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(run_device), batch_y.to(run_device)
                val_loss += criterion(model(batch_X), batch_y).item()

        avg_train_loss = average(train_loss, len(train_loader))
        avg_val_loss = average(val_loss, len(val_loader))

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # Update learning rate
        scheduler.step()

        # Print progress every fifth epoch
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    return train_losses, val_losses

# Start training
# Runs 20 epochs and keeps the per-epoch average losses for later inspection/saving
print("Starting training...")
train_losses, val_losses = train_model(
    model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=20
)


In [None]:
# Prediction function
def predict_next_vector(model, input_sequence, threshold=0.5):
    """
    Predict the vector that follows an input sequence.

    Args:
        model: Trained model; it is switched to evaluation mode.
        input_sequence: Tensor holding one sequence without a batch dimension.
        threshold: Currently unused and kept for interface compatibility;
            the raw model output is returned so the caller can pick a cut-off.

    Returns:
        The model output for the sequence with the batch dimension removed,
        as a CPU tensor.  The values are NOT thresholded.
    """
    # Follow the model instead of relying on a module-level `device` variable
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    with torch.no_grad():
        # The model expects a leading batch dimension
        input_batch = input_sequence.unsqueeze(0).to(run_device)
        prediction = model(input_batch)
        return prediction.squeeze(0).cpu()

# Test prediction on a sample
if len(X_val) > 0:
    # Take the first validation sequence and the vector that really followed it
    sample_sequence = X_val[0]
    true_next = y_val[0]

    # Predict
    predicted_next = predict_next_vector(model, sample_sequence)

    print("Sample prediction:")
    print(f"True next vector (first 10 elements): {true_next[:10].numpy()}")
    print(f"Predicted next vector (first 10 elements): {predicted_next[:10].numpy()}")

    # Accuracy = share of vector elements whose thresholded prediction equals
    # the target.  (The mean absolute difference used before is an error
    # measure: lower is better, so it must not be reported as accuracy.)
    predicted_bits = (predicted_next > 0.5).float()
    accuracy = (predicted_bits == true_next).float().mean()
    print(f"Prediction accuracy for this sample: {accuracy:.4f}")


In [None]:

# Save the trained model
# The configuration is read back from the model itself so that the checkpoint
# always describes the network whose weights it contains.
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': {
        'input_dim': model.input_dim,
        'hidden_dim': model.hidden_dim,
        'num_layers': model.num_layers,
        'dropout': model.lstm.dropout
    },
    'train_losses': train_losses,
    'val_losses': val_losses
}, 'log_lstm_model.pth')

print("\nModel saved as 'log_lstm_model.pth'")
print("Training complete!")


In [None]:
def load_model(model_path):
    """Rebuild a LogLSTM from a checkpoint file.

    The checkpoint must contain 'model_config' (keyword arguments for
    LogLSTM) and 'model_state_dict'.  Returns the model, moved to the
    module-level ``device``, together with the whole checkpoint dict.
    """
    checkpoint = torch.load(model_path, map_location=device)

    restored = LogLSTM(**checkpoint['model_config'])
    restored.load_state_dict(checkpoint['model_state_dict'])
    restored.to(device)

    return restored, checkpoint

# Example usage (uncomment to use):
# loaded_model, checkpoint = load_model('log_lstm_model.pth')
# 
# # Use the loaded model for prediction
# sample_sequence = training_set[1000:1050]  # Take 50 consecutive vectors
# prediction = predict_next_vector(loaded_model, sample_sequence)
# print(f"Prediction shape: {prediction.shape}")
# print(f"Number of predicted active words: {prediction.sum().item()}")
