In [4]:
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Simulated application logs: three identical runs that each end in a crash.
_crash_run = [
    "Application started",
    "User logged in",
    "API request made to /endpoint",
    "Database query executed",
    "Response returned to user",
    "User performed action",
    "Another API request to /other-endpoint",
    "Cache updated",
    "Service timeout occurred",
    "Application crashed",
]

# Every run gets its own list object so the sequences stay independent.
sequences = [list(_crash_run) for _ in range(3)]

# Preprocessing function for logs
def preprocess_logs(sequence):
    """Replace variable parts of each log line with fixed placeholders.

    The substitutions run in this order on every line:
      1. ``/<word>-endpoint``  -> ``/<ENDPOINT>`` (a bare ``/endpoint`` has no
         ``<word>-`` prefix and is therefore left unchanged)
      2. ``Step <number>:``    -> ``<STEP>``
      3. ``timeout`` / ``crashed`` / ``successfully`` -> ``<STATUS>``

    Args:
        sequence: Iterable of log message strings.

    Returns:
        A new list with one normalized string per input line, in input order.
    """
    substitutions = (
        (r'/\w+-endpoint', '/<ENDPOINT>'),
        (r'\bStep \d+:', '<STEP>'),
        (r'timeout|crashed|successfully', '<STATUS>'),
    )

    normalized = []
    for entry in sequence:
        for pattern, placeholder in substitutions:
            entry = re.sub(pattern, placeholder, entry)
        normalized.append(entry)
    return normalized

# Normalize each sequence and concatenate the results into one flat list.
flattened_logs = []
for sequence in sequences:
    flattened_logs.extend(preprocess_logs(sequence))

# Sentence-level embeddings from the pretrained MiniLM model.
model = SentenceTransformer('all-MiniLM-L6-v2')
log_vectors = model.encode(flattened_logs)

# Pairwise cosine similarity between all log embeddings.
similarity_matrix = cosine_similarity(log_vectors)

# Greedy grouping: every log that has no target yet opens a new target id and
# claims each later, still unlabelled log whose similarity to it is above the
# threshold.
threshold = 0.8
final_targets = {}
current_target = 1
num_logs = len(log_vectors)

for anchor in range(num_logs):
    if anchor in final_targets:
        continue
    final_targets[anchor] = current_target
    for candidate in range(anchor + 1, num_logs):
        if candidate in final_targets:
            continue
        if similarity_matrix[anchor][candidate] > threshold:
            final_targets[candidate] = current_target
    current_target += 1

# One row per log line, with the target id assigned above.
final_data = {
    "Log": flattened_logs,
    "Target": [final_targets[position] for position in range(len(flattened_logs))],
}
df = pd.DataFrame(final_data)

# Function to get the target value for a specific row
def get_target(row_index, frame=None):
    """Return the "Target" value stored for one row.

    Args:
        row_index: Index label of the row to look up (passed to ``.loc``).
        frame: Optional DataFrame that has a "Target" column. When omitted,
            the notebook-level ``df`` is used, so existing
            ``get_target(row_index)`` calls keep working.

    Returns:
        The value in the "Target" column at ``row_index``.
    """
    # Fall back to the global only when no frame is passed, so the lookup can
    # be used (and checked) without depending on hidden notebook state.
    table = df if frame is None else frame
    return table.loc[row_index, "Target"]

# Show the full table of logs with their assigned targets.
print("Final DataFrame:", df, sep="\n")

# Look up the target of a single row (row 0 here).
row_index = 0
print(f"Target for row {row_index}: {get_target(row_index)}")


Final DataFrame:
                                          Log  Target
0                  <STEP> Application started       1
1                       <STEP> User logged in       2
2        <STEP> API request made to /endpoint       3
3              <STEP> Database query executed       4
4            <STEP> Response returned to user       5
5                <STEP> User performed action       6
6   <STEP> Another API request to /<ENDPOINT>       3
7                        <STEP> Cache updated       7
8            <STEP> Service <STATUS> occurred       8
9                 <STEP> Application <STATUS>       9
10                 <STEP> Application started       1
11                      <STEP> User logged in       2
12       <STEP> API request made to /endpoint       3
13             <STEP> Database query executed       4
14           <STEP> Response returned to user       5
15               <STEP> User performed action       6
16  <STEP> Another API request to /<ENDPOINT>       3
17         

In [10]:
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Simulated application logs: two runs that crash and one that shuts down cleanly.
_shared_steps = [
    "Application started",
    "User logged in",
    "API request made to /endpoint",
    "Database query executed",
    "Response returned to user",
    "User performed action",
    "Another API request to /other-endpoint",
    "Cache updated",
]
_crash_tail = ["Service timeout occurred", "Application crashed"]
_clean_tail = ["User logged out", "Application ended successfully"]

# List concatenation builds a fresh list for every run.
sequences = [
    _shared_steps + _crash_tail,
    _shared_steps + _crash_tail,
    _shared_steps + _clean_tail,
]

# Preprocessing function for logs
def preprocess_logs(sequence):
    """Replace variable parts of each log line with fixed placeholders.

    ``/<word>-endpoint`` becomes ``/<ENDPOINT>`` first, then the words
    ``timeout``, ``crashed`` and ``successfully`` become ``<STATUS>``.

    Args:
        sequence: Iterable of log message strings.

    Returns:
        A new list with one normalized string per input line, in input order.
    """
    def _normalize(line):
        # Endpoint rule runs before the status rule, as in the original order.
        line = re.sub(r'/\w+-endpoint', '/<ENDPOINT>', line)
        return re.sub(r'timeout|crashed|successfully', '<STATUS>', line)

    return [_normalize(line) for line in sequence]

# Normalize each sequence and concatenate the results into one flat list.
flattened_logs = []
for sequence in sequences:
    flattened_logs.extend(preprocess_logs(sequence))

# Load the pretrained sentence-embedding model used by the cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to embed individual words from logs
def embed_words(log):
    """Encode every whitespace-separated token of ``log`` on its own.

    Args:
        log: A single log message string.

    Returns:
        The result of ``model.encode`` for the list of tokens, i.e. one
        embedding per token of the message.
    """
    return model.encode(log.split())

# Word-level embeddings: one record per log line, computed exactly once.
word_level_embeddings = []
for log in flattened_logs:
    word_embeddings = embed_words(log)
    word_level_embeddings.append({"Log": log, "Word_Embeddings": word_embeddings})

# Sentence-level embedding of every log line.
log_vectors = model.encode(flattened_logs)

# Pairwise cosine similarity between all log embeddings.
similarity_matrix = cosine_similarity(log_vectors)

# Greedy grouping: every log that has no target yet opens a new target id and
# claims each later, still unlabelled log whose similarity to it is above the
# threshold.
threshold = 0.8
final_targets = {}
current_target = 1

for i in range(len(log_vectors)):
    if i not in final_targets:
        final_targets[i] = current_target
        for j in range(i + 1, len(log_vectors)):
            if j not in final_targets and similarity_matrix[i][j] > threshold:
                final_targets[j] = current_target
        current_target += 1

# Assemble the table. The word embeddings are taken from the records built
# above instead of calling embed_words a second time for every log line.
final_data = {
    "Log": flattened_logs,
    "Target": [final_targets[i] for i in range(len(flattened_logs))],
    "Word_Embeddings": [record["Word_Embeddings"] for record in word_level_embeddings],
}

df = pd.DataFrame(final_data)

# Function to get the target value for a specific row
def get_target(row_index, frame=None):
    """Return the "Target" value stored for one row.

    Args:
        row_index: Index label of the row to look up (passed to ``.loc``).
        frame: Optional DataFrame that has a "Target" column. When omitted,
            the notebook-level ``df`` is used, so existing
            ``get_target(row_index)`` calls keep working.

    Returns:
        The value in the "Target" column at ``row_index``.
    """
    # Fall back to the global only when no frame is passed, so the lookup can
    # be used (and checked) without depending on hidden notebook state.
    table = df if frame is None else frame
    return table.loc[row_index, "Target"]

# Show the full table of logs with their targets and word embeddings.
print("Final DataFrame:", df, sep="\n")

# Look up the target of a single row (row 5 here).
row_index = 5
print(f"Target for row {row_index}: {get_target(row_index)}")


Final DataFrame:
                                   Log  Target  \
0                  Application started       1   
1                       User logged in       2   
2        API request made to /endpoint       3   
3              Database query executed       4   
4            Response returned to user       5   
5                User performed action       6   
6   Another API request to /<ENDPOINT>       3   
7                        Cache updated       7   
8            Service <STATUS> occurred       8   
9                 Application <STATUS>       9   
10                 Application started       1   
11                      User logged in       2   
12       API request made to /endpoint       3   
13             Database query executed       4   
14           Response returned to user       5   
15               User performed action       6   
16  Another API request to /<ENDPOINT>       3   
17                       Cache updated       7   
18           Service <STATUS> occ

In [11]:
# Preview the first rows of df (Log, Target and Word_Embeddings columns).
df.head()

Unnamed: 0,Log,Target,Word_Embeddings
0,Application started,1,"[[-0.033434853, 0.01059796, -0.061904375, -0.1..."
1,User logged in,2,"[[-0.054399997, 0.023594018, -0.032389987, -0...."
2,API request made to /endpoint,3,"[[-0.1123679, 0.02218953, -0.07871584, 0.01256..."
3,Database query executed,4,"[[0.04617131, -0.020743735, -0.092719115, 0.02..."
4,Response returned to user,5,"[[-0.03762575, 0.07458926, -0.015163289, 0.040..."


In [14]:
# This cell expects X, y, log_embeddings and df (plus np, cosine_similarity,
# Sequential, LSTM and Dense) to exist already; none of them are created here.

# Add the time dimension (timesteps = 1) that the LSTM layer expects. The guard
# keeps the cell re-runnable: without it a second run would try to reshape an
# X that is already 3-D.
if X.ndim == 2:
    X = X.reshape(X.shape[0], 1, X.shape[1])
# y is used as-is (samples, features); the previous reshape of y to its own
# shape changed nothing and has been dropped.

# Define the RNN model: one LSTM layer followed by a linear projection back to
# the embedding size.
model_rnn = Sequential([
    LSTM(128, input_shape=(X.shape[1], X.shape[2]), return_sequences=False),  # input_shape=(timesteps, features)
    Dense(X.shape[2], activation='linear')  # Output embedding size
])

model_rnn.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the RNN to map each input embedding to its target embedding.
model_rnn.fit(X, y, epochs=20, batch_size=4, verbose=1)

# Predict the next log embedding for a given input (row 0 in this case)
input_row_index = 0
input_embedding = X[input_row_index].reshape(1, 1, X.shape[2])  # Add batch and time dimensions

predicted_embedding = model_rnn.predict(input_embedding)

# Pick the known log whose embedding has the highest cosine similarity to the
# prediction. similarities has a single row, so argmax is a column index.
similarities = cosine_similarity(predicted_embedding, log_embeddings)
predicted_index = np.argmax(similarities)

# Display the predicted log.
# NOTE(review): assumes the rows of df are in the same order as log_embeddings - confirm.
predicted_log = df.loc[predicted_index, "Log"]
print(f"Predicted next log for row {input_row_index}: {predicted_log}")


Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0014 - mae: 0.0280
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 9.4249e-04 - mae: 0.0230
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.7563e-04 - mae: 0.0197 
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5.5548e-04 - mae: 0.0183 
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.1442e-04 - mae: 0.0175 
Epoch 6/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.8944e-04 - mae: 0.0171 
Epoch 7/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.1045e-04 - mae: 0.0158  
Epoch 8/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.6319e-04 - mae: 0.0147 
Epoch 9/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/

# Predicting the next log from a raw text input

In [27]:
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Simulated application logs: two runs that crash and one that shuts down cleanly.
_common_prefix = (
    "Application started",
    "User logged in",
    "API request made to /endpoint",
    "Database query executed",
    "Response returned to user",
    "User performed action",
    "Another API request to /other-endpoint",
    "Cache updated",
)
_crash_ending = ("Service timeout occurred", "Application crashed")
_clean_ending = ("User logged out", "Application ended successfully")

# Unpacking into a list literal gives every run its own list object.
sequences = [
    [*_common_prefix, *ending]
    for ending in (_crash_ending, _crash_ending, _clean_ending)
]

# Preprocessing function for logs
def preprocess_logs(sequence):
    """Replace variable parts of each log line with fixed placeholders.

    The inner substitution turns ``/<word>-endpoint`` into ``/<ENDPOINT>``;
    the outer one then turns ``timeout``, ``crashed`` and ``successfully``
    into ``<STATUS>``.

    Args:
        sequence: Iterable of log message strings.

    Returns:
        A new list with one normalized string per input line, in input order.
    """
    return [
        re.sub(
            r'timeout|crashed|successfully',
            '<STATUS>',
            re.sub(r'/\w+-endpoint', '/<ENDPOINT>', raw_line),
        )
        for raw_line in sequence
    ]

# Normalize each sequence and append its lines to one flat list.
flattened_logs = []
for sequence in sequences:
    flattened_logs += preprocess_logs(sequence)

# Load the pretrained sentence-embedding model used for the word embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Calculate log-level embeddings (mean of word embeddings for each log)
def calculate_log_embedding(log):
    """Embed one log line as the average of its per-word embeddings.

    Args:
        log: A single log message string; it is split on whitespace.

    Returns:
        The mean over axis 0 of the embeddings ``model.encode`` produces for
        the individual words.
    """
    token_vectors = model.encode(log.split())
    return np.mean(token_vectors, axis=0)

# One embedding per preprocessed log line, in the same order as flattened_logs.
log_embeddings = list(map(calculate_log_embedding, flattened_logs))

# Prepare data for RNN
def prepare_sequences(embeddings):
    """Build (current, next) training pairs from an ordered run of embeddings.

    Element ``i`` of the first result is ``embeddings[i]`` and element ``i``
    of the second result is ``embeddings[i + 1]``. Pairs are formed over the
    whole input, so when the data consists of several independent runs, call
    this once per run to avoid pairing the end of one run with the start of
    the next.

    Args:
        embeddings: Sequence of equally sized 1-D embedding vectors.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with ``len(embeddings) - 1`` rows
        each. For a single embedding both arrays have shape ``(0, dim)``, so
        the feature dimension is still available to callers; for an empty
        input both have shape ``(0,)``.
    """
    stacked = np.array(embeddings)
    # Copies keep X and y from sharing memory with each other.
    return stacked[:-1].copy(), stacked[1:].copy()

# Build (current log -> next log) pairs one sequence at a time. Pairing over
# the flattened list would also train on the jump from the last log of one
# run to the first log of the next run, which is not a real transition.
# log_embeddings is in flattened order, so consecutive slices of len(sequence)
# items line up with the individual sequences.
X_parts, y_parts = [], []
offset = 0
for sequence in sequences:
    run_embeddings = log_embeddings[offset:offset + len(sequence)]
    offset += len(sequence)
    if len(run_embeddings) < 2:
        continue  # a single log has no successor to learn from
    X_run, y_run = prepare_sequences(run_embeddings)
    X_parts.append(X_run)
    y_parts.append(y_run)

X = np.concatenate(X_parts)
y = np.concatenate(y_parts)

# Reshape X to add time dimension (timesteps = 1)
X = X.reshape(X.shape[0], 1, X.shape[1])

# Network: LSTM -> dropout -> dense hidden layer -> linear projection back to
# the embedding size.
timesteps, embedding_dim = X.shape[1], X.shape[2]

model_rnn = Sequential()
model_rnn.add(LSTM(256, input_shape=(timesteps, embedding_dim), return_sequences=False))
model_rnn.add(Dropout(0.3))  # regularization
model_rnn.add(Dense(128, activation='relu'))
model_rnn.add(Dense(embedding_dim, activation='linear'))

# Mean squared error between predicted and actual next-log embedding.
model_rnn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train for 50 epochs with a batch size of 8.
model_rnn.fit(X, y, epochs=50, batch_size=8, verbose=1)

# Write the trained model to disk, then read it back.
model_path = "log_prediction_rnn.h5"
model_rnn.save(model_path)
print("Model saved as log_prediction_rnn.h5")

loaded_model = load_model(model_path)
print("Model loaded successfully")


Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.0014 - mae: 0.0280
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0011 - mae: 0.0256
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 9.4344e-04 - mae: 0.0233
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 7.5588e-04 - mae: 0.0209 
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 6.4476e-04 - mae: 0.0195 
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 6.0504e-04 - mae: 0.0191 
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 5.6588e-04 - mae: 0.0184  
Epoch 8/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 5.5431e-04 - mae: 0.0183 
Epoch 9/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11



Model saved as log_prediction_rnn.h5




Model loaded successfully


In [29]:
# Write the trained network to disk.
model_rnn.save('rnn_log_model.h5')

# Write the log embeddings and their log lines next to it so a later cell can
# map a predicted embedding back to a log line.
import pickle

artifacts = {'log_embeddings': log_embeddings, 'flattened_logs': flattened_logs}
with open('log_data.pkl', 'wb') as handle:
    pickle.dump(artifacts, handle)




In [30]:
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model
from sentence_transformers import SentenceTransformer

# Restore the trained next-log model from disk.
model_rnn = load_model('rnn_log_model.h5')

# Restore the stored embeddings and the log lines they belong to.
# NOTE: pickle.load can run arbitrary code from the file; only load files
# that this notebook wrote itself.
with open('log_data.pkl', 'rb') as f:
    data = pickle.load(f)
log_embeddings, flattened_logs = data['log_embeddings'], data['flattened_logs']

# Sentence-embedding model used to embed new input logs.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to calculate log embedding
def calculate_log_embedding(log):
    """Embed one log line as the average of its per-word embeddings.

    Args:
        log: A single log message string; it is split on whitespace.

    Returns:
        The mean over axis 0 of the embeddings ``sentence_model.encode``
        produces for the individual words.
    """
    tokens = log.split()
    return np.mean(sentence_model.encode(tokens), axis=0)

# Function to predict the next log
def _normalize_log(log):
    """Apply the endpoint and status placeholder substitutions to one log line."""
    import re  # local import: this cell's import list does not include re

    log = re.sub(r'/\w+-endpoint', '/<ENDPOINT>', log)
    return re.sub(r'timeout|crashed|successfully', '<STATUS>', log)


def predict_next_log(input_log):
    """Predict which known log line is most likely to follow ``input_log``.

    The input is first normalized with the same placeholder substitutions
    (``/<ENDPOINT>``, ``<STATUS>``) that the notebook's ``preprocess_logs``
    applies to the training logs before they are embedded, so a raw line such
    as "Service timeout occurred" is embedded in its normalized form. The
    normalized line is embedded, passed through ``model_rnn``, and the
    prediction is matched against ``log_embeddings`` by cosine similarity.

    Args:
        input_log: Raw log message string.

    Returns:
        The entry of ``flattened_logs`` whose stored embedding is most similar
        to the predicted embedding.

    Raises:
        ValueError: If ``input_log`` contains no words, because there is then
            nothing to embed.
    """
    normalized_log = _normalize_log(input_log)
    if not normalized_log.strip():
        raise ValueError("input_log must contain at least one word")

    # Shape (1, 1, features): one sample, one timestep.
    input_embedding = calculate_log_embedding(normalized_log).reshape(1, 1, -1)

    # Predict the next log embedding
    predicted_embedding = model_rnn.predict(input_embedding)

    # similarities has a single row, so argmax is the index of the best match.
    similarities = cosine_similarity(predicted_embedding, log_embeddings)
    predicted_index = np.argmax(similarities)

    return flattened_logs[predicted_index]




In [32]:
# Example usage: pass one log line to predict_next_log and print the known
# log line it returns as the predicted successor.
input_log = "User performed action"
predicted_log = predict_next_log(input_log)
print(f"Predicted next log for input '{input_log}': {predicted_log}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Predicted next log for input 'User performed action': Another API request to /<ENDPOINT>


In [33]:
!ls -lrt


total 8748
drwxr-xr-x 1 root root    4096 Jan 24 14:22 sample_data
-rw-r--r-- 1 root root 8901096 Jan 28 10:30 rnn_log_model.h5
-rw-r--r-- 1 root root   47716 Jan 28 10:30 log_data.pkl
