In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -U tensorflow==2.0.0

import tensorflow as tf

# Report whether TensorFlow can see a GPU. The name is an empty string when
# no GPU is available, so the "found" message must only be printed in the
# branch where a GPU device name was actually returned.
device_name = tf.test.gpu_device_name()
if "GPU" not in device_name:
    print("GPU device not found")
else:
    print('Found GPU at: {}'.format(device_name))

In [3]:
# import the rare dataset (pipe-separated CSV, first row is the header)
rare_ds = pd.read_csv("/kaggle/input/rare-kubernetes-anomalies/RARE.csv", sep="|", header=0)
# Keep an untouched copy before any column is dropped; it is needed later to
# plot the `instance` field.
original_rare_ds = rare_ds.copy()
# Preview goes last so the notebook actually renders it (a bare expression in
# the middle of a cell is evaluated and discarded).
rare_ds.head()

In [4]:
# Show the (rows, columns) size of the raw dataset.
print(rare_ds.shape)

In [5]:
# List the distinct column dtypes present in the raw dataset.
rare_ds.dtypes.unique()

We can observe that we have a time series dataset, which contains the timestamp as the first column and the anomaly flag as the second column. There is also the instance field, which contains a descriptive value of the cluster status at that moment.
Then, there is a set of Prometheus metrics, collected for the specific timestamp. The anomaly is the field that we are going to use as the training target, and the other entries are numerical values that are going to influence the model.
We have a total of 10,010 rows for training, and a total of 7063 features, from which we will need to select.
Let's show the different values for the anomaly and instance fields:

In [6]:
# Header names may carry surrounding whitespace; normalise them
# before looking columns up by name.
rare_ds.columns = list(map(str.strip, rare_ds.columns))
# Show the distinct values of the label and of the descriptive instance field.
for field in ("anomaly", "instance"):
    print(rare_ds[field].unique())

We can see that the anomaly is just a binary field that tells us whether there is an anomaly or not. For our case, we can drop the instance field, as it is not relevant for training based on anomalies.

In [7]:
# Drop the descriptive `instance` column. Rebinding the name (instead of
# mutating in place) together with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
rare_ds = rare_ds.drop(columns=['instance'], errors='ignore')


 Now we need to proceed with feature selection. We have a total of 7060 metrics and the objective is to select the relevant ones for the project. To do so, we can start with manual removal, using Kubernetes domain knowledge to discard the non-meaningful ones:
 - any information about resource creation is not relevant, because it depends on the workloads that exist on each cluster
 - Kafka topics, logs, etc. are a consequence of other operations in the cluster; they cannot be the cause of cluster failures
 - any information about pod status, container status, etc. that carries a fixed id needs to be discarded, as it will change depending on the workloads and the cluster

In [8]:
# Metric-name prefixes that are discarded manually (see the rationale in the
# markdown cell above this one).
excluded_prefixes = (
    "kafka_topic",
    "kafka_log",
    "kube_configmap",
    "kube_namespace",
    "kube_service",
    "kube_secret",
    "kube_pod_status",
    "kube_pod_container_status",
    "scrape_samples",
    "kafka_server_brokertopic",
)
# Keep, in sorted order, every column whose stripped name does not start with
# one of the excluded prefixes.
final_columns = [
    name.strip()
    for name in sorted(rare_ds.columns)
    if not name.strip().startswith(excluded_prefixes)
]

In [9]:
# Independent copy restricted to the manually retained columns, so later
# dtype conversions do not touch rare_ds.
features_ds = rare_ds.loc[:, final_columns].copy()
features_ds.shape

After manual selection we will use automated methods. The first step is to remove the constant features. Any metric that has a constant value, with no relevant variance over the dataset, is not useful to discriminate a target. We can find those constant features using variance:

In [10]:
# List the distinct column dtypes left after the manual column selection.
features_ds.dtypes.unique()

In [11]:
# Cast every object-typed column to float. The dtypes are read once up front,
# so each column is judged by the dtype it had before this cell ran.
for name, dtype in features_ds.dtypes.items():
    if dtype == "object":
        features_ds[name] = features_ds[name].astype(float)
# Show the dtypes that remain after the conversion.
features_ds.dtypes.unique()

In [12]:
from sklearn.feature_selection import VarianceThreshold
# Fit a variance filter: columns whose variance does not exceed 0.1 are marked
# for removal (sel.get_support() flags the columns that are kept).
# NOTE(review): features_ds still contains the `time` and `anomaly` columns at
# this point (the prefix filter does not exclude them), so they are evaluated
# here alongside the metrics -- confirm that is intended.
sel = VarianceThreshold(threshold=0.1)
sel.fit(features_ds)

In [13]:
# Count the features rejected by the variance filter; those can be removed.
retained_names = set(features_ds.columns[sel.get_support()])
print(sum(1 for name in features_ds.columns if name not in retained_names))

In [14]:
# Names of the columns rejected by the variance filter.
retained_names = set(features_ds.columns[sel.get_support()])
to_strip = [name for name in features_ds.columns if name not in retained_names]
# New frame without the rejected columns; features_ds itself is left untouched.
features_ds_strip = features_ds.drop(columns=to_strip)
features_ds_strip.shape

With that technique we have reduced the number of features to 1740. That is still a large number of features, so we need to apply more feature reduction techniques to select the most relevant ones. As we are dealing with a large number of features, we will reduce them using the Mutual Information technique:

In [15]:
# Feature matrix for supervised feature selection. The variance filter is run
# on a frame that still contains the label (`anomaly`) and the raw timestamp
# (`time`); if they survive it they must not be offered as candidate features,
# otherwise the target leaks into its own predictors. errors='ignore' covers
# the case where the variance filter already removed either column.
X = features_ds_strip.drop(columns=["anomaly", "time"], errors="ignore")
# Target as a single-column frame.
Y = rare_ds["anomaly"].to_frame()
print(X.shape)
print(Y.shape)

In [16]:
# select with mutual information
from functools import partial

from sklearn.feature_selection import SelectPercentile as SP
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split as tts
from sklearn.tree import DecisionTreeClassifier as DTC

# SelectPercentile scores with the ANOVA F-test (f_classif) unless told
# otherwise, so the mutual-information scorer has to be passed explicitly.
# It is seeded because its estimate involves randomness.
mi_score = partial(mutual_info_classif, random_state=0)
selector = SP(score_func=mi_score, percentile=2) # select features with top 2% MI scores
# Y is a single-column frame; flatten it to the 1-D target the selector expects.
selector.fit(X, Y.values.ravel())
X_4 = selector.transform(X)

# Stratified hold-out split, used only to sanity-check the selected features.
X_train_4,X_test_4,y_train,y_test = tts(
    X_4,Y
    ,random_state=0
    ,stratify=Y
)
# Seed the tree so the reported score is reproducible across runs.
model_4 = DTC(random_state=0).fit(X_train_4,y_train)
score_4 = model_4.score(X_test_4,y_test)

In [17]:
# Hold-out accuracy of the sanity-check tree, and the shape of the reduced
# feature matrix.
print(f"score_4:{score_4}")
print(X_4.shape)

In [18]:
# Recover the names of the columns kept by the selector: index the array of
# column names with the selector's boolean support mask.
support = np.asarray(selector.get_support())
columns = X.columns.to_numpy()
columns_with_support = columns[support]
print(columns_with_support)

Using the Mutual Information method and keeping the top 2% of features by MI score returned a total of 35 columns. With a more reasonable number of features, we can now use visualization to discard the highly correlated ones.

In [19]:
# The feature set is now small enough to draw a readable correlation map.
import seaborn as sns # data visualization library
import matplotlib.pyplot as plt

x = rare_ds[columns_with_support]
# Pairwise correlation between the selected columns.
correlations = x.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(correlations, ax=ax, annot=True, fmt='.1f', linewidths=.5)

In [20]:
# Column names used from here on for scaling and as model inputs.
# NOTE(review): presumably chosen by hand from the correlation map above by
# discarding highly correlated columns -- the selection rule is not in code.
common_features = ['java_lang_memorypool_committed_usage_g1_eden_space_0', 'java_lang_operatingsystem_freephysicalmemorysize_0', 'java_lang_operatingsystem_systemloadaverage_0', 'node_entropy_available_bits_0', 'node_filefd_allocated_0',
                   'node_load1_0', 'node_memory_Active_anon_bytes_0', 'node_memory_Active_file_bytes_0', 'node_memory_Cached_bytes_0', 'node_memory_Committed_AS_bytes_0', 'node_memory_Inactive_bytes_0', 'node_memory_KernelStack_bytes_0',
                   'node_memory_MemAvailable_bytes_0', 'node_memory_MemFree_bytes_0', 'node_memory_PageTables_bytes_0', 'node_memory_SReclaimable_bytes_0', 'node_memory_SUnreclaim_bytes_0', 'node_memory_Shmem_bytes_0', 'node_nf_conntrack_entries_0',
                   'node_nf_conntrack_entries_1', 'node_nf_conntrack_entries_2', 'node_sockstat_TCP_alloc_0', 'node_sockstat_sockets_used_0']

Once we have reduced the relevant features to a reasonable number, we can proceed to train the model, taking into consideration that we have labeled data in a time series format. To train it, there are several options; a common one is a recurrent neural network such as an LSTM. We will start by scaling the features:

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every retained feature with a min-max scaler fitted on the full
# dataset, then wrap the resulting array back into a labelled frame.
scaler = MinMaxScaler()
selected_features = rare_ds[common_features]
scaled_np = scaler.fit(selected_features).transform(selected_features)
scaled_df = pd.DataFrame(data=scaled_np, columns=common_features)
scaled_df.head()

In [22]:
# Put the timestamp and label columns back in front of the scaled features
# (column-wise concatenation, aligned on the row index).
time_and_label = rare_ds[['time', 'anomaly']]
final_df = pd.concat([time_and_label, scaled_df], axis=1)
final_df.head()

The next step is to convert the timestamp to datetime, and then to make sure the data has regular intervals. Checking the time range, we can see that we have information for over 15 hours, within one day. We can also see that the data is already divided into regular intervals of one second.

In [23]:
# Interpret the `time` column as epoch seconds and convert it to datetimes,
# then show the first values and the covered time range.
final_df['time'] = pd.to_datetime(final_df['time'], unit='s')
timestamps = final_df["time"]
print(timestamps.head())
print(min(timestamps))
print(max(timestamps))

The next step is to transform this data into a time series problem. In order to do that, we first need to decide the time window. This is where the data we dropped before - the instance information - can be useful, because it shows the time when the failure was introduced and the time it took to fail. Let's plot it:

In [24]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wide, short default figure size for the timeline plots.
sns.set(rc={'figure.figsize':(11, 4)})

# Normalise the header names of the untouched copy, convert its epoch-second
# timestamps and index the rows by time.
original_rare_ds.columns = list(map(str.strip, original_rare_ds.columns))
original_rare_ds['time'] = pd.to_datetime(original_rare_ds['time'], unit='s')
original_df = original_rare_ds.set_index("time")

# Encode each distinct instance label as an integer code so it can be drawn
# as a line over time.
instance_codes = pd.factorize(original_df["instance"])[0]
original_df['factor'] = instance_codes
original_df['factor'].plot(linewidth=0.5);

In [25]:
# Zoom into one hour (20:17-21:17 on 2020-01-14) to compare the instance code
# with the anomaly flag.
original_df[['factor', 'anomaly']].loc['2020-01-14 20:17:00':'2020-01-14 21:17:00'].plot(linewidth=0.5);

In [26]:
# Zoom further into an eight-minute slice (20:24-20:32) of the same day.
original_df[['factor', 'anomaly']].loc['2020-01-14 20:24:00':'2020-01-14 20:32:00'].plot(linewidth=0.5);

We can see that the tests have been executed in time windows of 7:30 minutes, and that the lag is a little over 1:40 minutes. We will therefore use those values as the window and the lag for the LSTM. Let's create the function first:

In [27]:
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    """Reframe a time-ordered DataFrame as a supervised-learning table.

    Each output row holds, side by side, the `window` previous rows
    (columns suffixed ``(t-window)`` ... ``(t-1)``), the current row
    (suffix ``(t)``) and the row `lag` steps ahead (suffix ``(t+lag)``).

    Parameters:
        data: DataFrame whose rows are consecutive time steps.
        window: number of past steps to include.
        lag: how many steps ahead the target columns are taken from.
        dropnan: when True, rows made incomplete by the shifting are removed.

    Returns:
        DataFrame with one group of columns per included time step.
    """
    base_names = list(data.columns)
    frames = []
    labels = []

    # Past steps, oldest first: (t-window), ..., (t-1)
    for step in range(window, 0, -1):
        frames.append(data.shift(step))
        labels.extend('%s(t-%d)' % (name, step) for name in base_names)

    # Present step: (t)
    frames.append(data)
    labels.extend('%s(t)' % name for name in base_names)

    # Future step: (t+lag)
    frames.append(data.shift(-lag))
    labels.extend('%s(t+%d)' % (name, lag) for name in base_names)

    agg = pd.concat(frames, axis=1)
    agg.columns = labels

    # Shifting leaves NaN at both ends of the series; optionally discard those rows.
    return agg.dropna() if dropnan else agg

In [28]:
# The data has one row per second, so window and lag are expressed in seconds.
window = int(60.0 * 7.5)
lag = 100
# The timestamp itself is not a model input; only the label and the scaled
# features are reframed.
supervised_input = final_df.drop(columns='time')
series = series_to_supervised(supervised_input, window=window, lag=lag)
series.head()

* Once we have the data as a time series, it is time to divide it into train and test sets. In this case, we are going to use the earliest measurements as train and the latest measurements as test, so we must not shuffle our data.

In [29]:
from sklearn.model_selection import train_test_split

# Label: the anomaly flag `lag` steps ahead of the current row.
labels_col = 'anomaly(t+%d)' % lag
labels = series[labels_col]

# Remove every column taken from the future step, not only the label:
# series_to_supervised shifts all columns by `lag`, so the feature values at
# t+lag would otherwise remain among the inputs and leak information from the
# very time step that is being predicted.
future_suffix = '(t+%d)' % lag
future_cols = [col for col in series.columns if col.endswith(future_suffix)]
series = series.drop(columns=future_cols)

# Chronological split (no shuffling): the last 20% of the rows are held out as test.
X_total, X_test, Y_total, Y_test = train_test_split(series, labels.values, test_size=0.2, random_state=0, shuffle=False)
print(X_test.shape)
print(Y_test.shape)

In [30]:
# Carve the validation set out of the non-test portion (X_total / Y_total).
# Splitting the full `series` again would make the validation rows identical
# to the held-out test rows and leave no truly unseen data.
X_train, X_valid, Y_train, Y_valid = train_test_split(X_total, Y_total, test_size=0.2, random_state=0, shuffle=False)
print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)
print(Y_valid.shape)

Once we have the data in the proper shape, and it is properly divided into train/validation/test sets, it is time to start applying a model. We are going to use an LSTM model, which is able to pick up information from a sequence and learn patterns, especially in long sequences.

In [31]:
# prepare to apply LSTM
# Append a trailing axis of length 1, turning (rows, columns) into
# (rows, columns, 1) as required by the 3-D LSTM input.
X_train_series = np.expand_dims(X_train.values, axis=-1)
X_valid_series = np.expand_dims(X_valid.values, axis=-1)
print('Train set shape', X_train_series.shape)
print('Validation set shape', X_valid_series.shape)

In [32]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping

# Stop training early when the monitored validation loss stops decreasing
# (patience of 3 epochs).
earlyStop=EarlyStopping(monitor="val_loss",verbose=0,mode='min',patience=3)

# Training hyper-parameters used when building and fitting the LSTM below.
epochs = 10
batch = 256
n_neurons=10

In [33]:
# Single-layer LSTM binary classifier: LSTM -> dropout -> sigmoid output.
model_lstm = Sequential()
model_lstm.add(LSTM(units=n_neurons, activation='relu', input_shape=(X_train_series.shape[1], X_train_series.shape[2])))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
# compile() only configures loss, optimizer and metrics. Validation data is an
# argument of fit(), so it must not be passed here.
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

In [None]:
# Train the LSTM, evaluating on the validation split after every epoch and
# stopping early through the earlyStop callback.
lstm_history = model_lstm.fit(
    X_train_series,
    Y_train,
    validation_data=(X_valid_series, Y_valid),
    epochs=epochs,
    batch_size=batch,
    verbose=1,
    callbacks=[earlyStop],
)

In [None]:
import plotly.graph_objs as go
