## Log Classification Using LSTM

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import keras

from keras.models import Model
from keras.layers import LSTM, Input, Dense, Dropout, Embedding
from keras.optimizers import Adam

from keras.utils import to_categorical
import re

## Read the Data

In [6]:
df = pd.read_csv('./combined_logs_with_labels.csv',delimiter=',', encoding='latin-1')
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,Log,Label
0,143 INFO dfs.DataNode$DataXceiver: Receiving block src: /10.250.19.102:54106 dest: /10.250.19.102:50010,Normal
1,35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar.,Normal
2,143 INFO dfs.DataNode$DataXceiver: Receiving block src: /10.250.10.6:40524 dest: /10.250.10.6:50010,Normal
3,145 INFO dfs.DataNode$DataXceiver: Receiving block src: /10.250.14.224:42420 dest: /10.250.14.224:50010,Normal
4,145 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block terminating,Normal


## Model

In [9]:
# making it TF.Data and spliting into 2 datasets
df['Label'] = df['Label'].map({'Normal':0,'Anomaly':1})

##### Train-test split ensuring 80% of the anomolous data is within the train dataset

In [12]:
# Shuffle the entire DataFrame
df = df.sample(frac=1, random_state=38).reset_index(drop=True)  # Shuffle the entire dataset

# Total samples
total_samples = len(df)
train_size = int(total_samples * 0.8)  # 80% of the entire dataset for training

# Separate the classes
normal_samples = df[df['Label'] == 0]
anomaly_samples = df[df['Label'] == 1]

#print(total_samples) # 19412
#print(train_size) # 15529
#print(len(normal_samples)) # 17220
#print(len(anomaly_samples)) # 2192

# Determine the number of Anomaly samples for the training set
anomaly_train_size = int(len(anomaly_samples) * 0.8)  # 80% of Anomaly samples
normal_train_size = train_size - anomaly_train_size  # Remaining from Normals

# Sample from each class
normal_train = normal_samples.sample(normal_train_size, random_state=38)
anomaly_train = anomaly_samples.sample(anomaly_train_size, random_state=38)
#print(len(normal_train)) # 13776
#print(len(anomaly_train)) # 1753

# Combine the training samples
train_df = pd.concat([anomaly_train, normal_train])
# Remaining as test samples
test_df = pd.concat([anomaly_samples, normal_samples]).drop(train_df.index)




##### Handling class imbalance

In [15]:
normal = train_df['Label'].value_counts()[0]
anomaly = train_df['Label'].value_counts()[1]
#normal, anomaly
total = normal + anomaly
weight_for_0 = (1 / normal) * (total) / 2.0
weight_for_1 = (1 / anomaly) * (total) / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1} #create a dictionary

print("Weight for class 0: {:.2f}".format(weight_for_0))
print("Weight for class 1: {:.2f}".format(weight_for_1))

Weight for class 0: 0.56
Weight for class 1: 4.43


In [17]:
seed = 42
train_dataset = tf.data.Dataset.from_tensor_slices((train_df['Log'].values, train_df['Label'].values))
train_dataset = train_dataset.shuffle(buffer_size=len(df), seed=seed)
test_dataset = tf.data.Dataset.from_tensor_slices((test_df['Log'].values, test_df['Label'].values))

In [19]:
# Define the maximum number of words in your vocabulary
max_words = 10000
# Define the sequence length
max_len = 100

##### Text Vectorization Layer

In [22]:
from keras.layers import TextVectorization

# Create a TextVectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_words,
    output_mode='int',
    output_sequence_length=max_len)

In [24]:
# Adapt the layer to your training data (this builds the vocabulary)
text_ds = train_dataset.map(lambda x, y: x) #throw away y
vectorize_layer.adapt(text_ds.batch(128))

In [25]:
# batch & prefetch the dataset
train_dataset = train_dataset.batch(128).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(128).prefetch(tf.data.AUTOTUNE)

##### LSTM

In [29]:
Inp = Input(shape=(1,),dtype=tf.string, name='text_input')
x = vectorize_layer(Inp)
x = Embedding(max_words, 48, input_length=max_len, name="embedding")(x) #48 is the representation
x = LSTM(64, name="LSTM")(x) 
x = Dense(256, activation='relu', name='Dense_01')(x)
x = Dropout(0.7, name='Dropout1')(x)
x = Dense(256, activation='relu', name='Dense_02')(x)
x = Dropout(0.6, name='Dropout2')(x)
out = Dense(1, activation='sigmoid', name='output')(x)



In [31]:
model = Model(inputs=Inp,outputs=out)

In [33]:
model.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [35]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)
#prevent overfitting

In [37]:
model.fit(train_dataset,
          epochs=10,
          validation_data=test_dataset,
          class_weight = class_weight,
          callbacks=[early_stopping]
          )

Epoch 1/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 139ms/step - accuracy: 0.6308 - loss: 0.6952 - val_accuracy: 0.1131 - val_loss: 0.6996
Epoch 2/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 122ms/step - accuracy: 0.4334 - loss: 0.7004 - val_accuracy: 0.8869 - val_loss: 0.6843
Epoch 3/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 114ms/step - accuracy: 0.4586 - loss: 0.7076 - val_accuracy: 0.8869 - val_loss: 0.6803
Epoch 4/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 117ms/step - accuracy: 0.7367 - loss: 0.6925 - val_accuracy: 0.8869 - val_loss: 0.6868


<keras.src.callbacks.history.History at 0x211de59c290>

In [39]:
model.evaluate(test_dataset)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.6675 - loss: 0.6894


[0.6843394637107849, 0.8869431018829346]

## Predicting unseen logs

In [41]:
# This log event is supposed to be Normal
text = ['INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.66.102:50010 is added to size 67108864']

model.predict(tf.constant(text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 575ms/step


array([[0.49426717]], dtype=float32)

In [51]:
# This log event is supposed to be an Anomaly
text = ['INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000074_2/part-00074.']

model.predict(tf.constant(text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


array([[0.49426717]], dtype=float32)

##### From the results, this model does not seemed to perform well in prediction