In [1]:
import numpy as np
import pandas as pd
import re

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# Load the pickled train / test frames from the local ./data directory.
# NOTE(review): unpickling can execute arbitrary code, so only load these
# files if they come from a trusted source.
train = pd.read_pickle('./data/train.pkl')
test = pd.read_pickle('./data/test.pkl')

# Show the (rows, columns) shape of both frames as the cell result.
train.shape, test.shape

((472972, 2), (1418916, 1))

In [4]:
# expressions = {
#     r'\d+:\d+:\d+\S': '<TIME>',
#     r'\d+:\d+': '<RANGE>',
#     r'\d+-\d+-\d+\S': '<DAY>',
#     r'\S\S+[a-z] [\s\d]\d': '<MON> <DATE>',
#     r'\d+': '<NUM>',
#     'js:': 'js',
#     r'\\n': ' ',
#     '\t': ' ',
#     '"': '',
#     r':+\s': ' ',
#     ':': '=',
#     '{': '',
#     '}': '',
#     '(': '',
#     ')': '',
#     ',': ' ',
#     r'\s+': ' '
# }

# Ordered table of regex pattern -> replacement text used by convert() and
# convert_df(). Dicts iterate in insertion order, so the specific
# time / range / day patterns are applied before the generic r'\d+' rule,
# which would otherwise rewrite their digits first.
expressions = {
    # three colon-separated digit runs followed by one non-whitespace character
    r'\d+:\d+:\d+\S': '<TIME>',
    # two colon-separated digit runs
    r'\d+:\d+': '<RANGE>',
    # three dash-separated digit runs followed by one non-whitespace character
    r'\d+-\d+-\d+\S': '<DAY>',
    # a word ending in a lowercase letter, a space, then a (whitespace-or-digit, digit) pair
    # NOTE(review): presumably targets syslog-style "Mon DD" prefixes — confirm against the raw logs
    r'\S\S+[a-z] [\s\d]\d': '<MON> <DATE>',
    # any remaining digit run
    r'\d+': '<NUM>',
#     'js:': 'js',
#     r'\\n|\s+|,|:+\s': ' ',
    # literal backslash-n sequences, commas, square brackets, '=' and ':' become a space
    r'\\n|,|[[]|[]]|[=]|[:]': ' ',
    # braces, parentheses, double quotes and runs of backslashes are removed
    r'[{]|[}]|[(]|[)]|["]|[\\]+': '',
#     ':': '=',
}

def strip_strs(x):
    """Remove the spaces inside every double-quoted phrase of ``x``.

    A phrase is one or more opening quotes, the shortest possible run of
    characters, and a closing quote. Each phrase found is substituted
    everywhere it occurs in the string by its space-free version, so a
    quoted phrase ends up as a single whitespace-delimited token.
    """
    quoted = re.compile(r'"+[\S\s]+?"')
    for phrase in quoted.findall(x):
        compact = phrase.replace(' ', '')
        x = x.replace(phrase, compact)
    return x

def convert(x):
    """Apply every substitution in ``expressions`` to the string ``x``.

    Patterns are applied one after another in the dict's insertion order,
    each one operating on the result of the previous substitution.
    """
    result = x
    for pattern in expressions:
        result = re.sub(pattern, expressions[pattern], result)
    return result

def convert_df(df_, col='full_log'):
    """Return a copy of ``df_`` whose text column has been normalised.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input frame; it is copied, not modified.
    col : str, default 'full_log'
        Name of the string column to normalise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` in which ``col`` first has the spaces inside quoted
        phrases removed (``strip_strs``) and then has every pattern of
        ``expressions`` substituted, in insertion order.
    """
    df = df_.copy()
    df[col] = df[col].map(strip_strs)
    for pattern, replacement in expressions.items():
        # The keys are regular expressions. pandas >= 2.0 defaults to
        # regex=False, which would match them literally and silently leave
        # the text unnormalised, so request regex matching explicitly.
        df[col] = df[col].str.replace(pattern, replacement, regex=True)

    return df

In [5]:
%%time
# Normalise the log text of both frames with convert_df; the call on `test`
# relies on the default column name 'full_log'.
df = convert_df(train, 'full_log')
test_X = convert_df(test)
# Show the converted training frame as the cell result.
df

Wall time: 33min 12s


Unnamed: 0_level_0,level,full_log
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,<MON> <DATE> <TIME> localhost kibana type err...
1,0,<MON> <DATE> <TIME> localhost logstash <DAY>...
2,0,<MON> <DATE> <TIME> localhost kibana type err...
3,0,<MON> <DATE> <TIME> localhost kibana type err...
4,1,type SYSCALL msg audit<NUM>.<RANGE> arch c<NU...
...,...,...
472967,0,<MON> <DATE> <TIME> localhost logstash <NUM> ...
472968,1,type SYSCALL msg audit<NUM>.<RANGE> arch c<NU...
472969,0,<MON> <DATE> <TIME> localhost kibana type log...
472970,0,<MON> <DATE> <TIME> localhost kibana type err...


In [6]:
# Split every normalised log line on single spaces and drop the empty strings
# that consecutive spaces produce; the result is one token list per log line.
tr_sent = [
    [token for token in line.split(' ') if len(token) > 0]
    for line in df['full_log'].values
]
test_sent = [
    [token for token in line.split(' ') if len(token) > 0]
    for line in test_X['full_log'].values
]

In [8]:
import gensim

# Dimensionality of the word embeddings (also reused as the NN input width).
emb_dim = 64

# Only configure the skip-gram (sg=1) model here. Passing the corpus to the
# constructor would build the vocabulary and train immediately, but the
# following cells call build_vocab() and train() explicitly, and a fresh
# build_vocab() re-initialises the weights — so that first training pass
# would be wasted compute.
w2v = gensim.models.Word2Vec(vector_size=emb_dim, sg=1, workers=4)

In [9]:
# Build the vocabulary from the tokenised training sentences.
# The list of token lists is passed as-is: wrapping the ragged lists in
# np.array() raises ValueError on NumPy >= 1.24 (it was only a deprecation
# warning before) and gensim just needs an iterable of token lists.
w2v.build_vocab(tr_sent)

In [10]:
# Train the embeddings for 100 epochs on the tokenised training sentences.
# The ragged list of token lists is passed directly; np.array() on it fails
# on NumPy >= 1.24 and gensim only needs a re-iterable of token lists.
# The call stays the last expression so its return value is displayed.
w2v.train(
    tr_sent,
    total_examples=w2v.corpus_count,
    epochs=100,
    compute_loss=True,
)

(1183564425, 2260737100)

In [11]:
# Inspect the token list produced for the first training log line.
tr_sent[0]

['<MON>',
 '<DATE>',
 '<TIME>',
 'localhost',
 'kibana',
 'type',
 'error',
 '@timestamp',
 '<DAY><TIME>',
 'tags',
 'stats-collection',
 'pid',
 '<NUM>',
 'level',
 'error',
 'error',
 'message',
 'NoLivingconnections',
 'name',
 'Error',
 'stack',
 'Error',
 'NoLivingconnections',
 'atsendReqWithConnection/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js',
 '<RANGE>',
 'atnext/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js',
 '<RANGE>',
 'atprocess._tickCallbackinternal/process/next_tick.js',
 '<RANGE>',
 'message',
 'NoLivingconnections']

In [13]:
def featureVecMethod(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one sentence.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single sentence.
    model : object exposing ``wv``
        Trained embedding model; ``model.wv.key_to_index`` is used for the
        vocabulary lookup and ``model.wv[word]`` for the vector.
    num_features : int
        Length of the returned vector (must match the embedding size).

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features`` holding the mean of the
        vectors of all tokens found in the vocabulary. If no token is in the
        vocabulary (or ``words`` is empty) an all-zero vector is returned
        instead of the NaNs a 0/0 division would produce.
    """
    # Accumulator for the sum of the known word vectors.
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    # Use the vocabulary dict the model already maintains: membership tests
    # are O(1) and no set of the whole vocabulary is rebuilt on every call.
    vocab = model.wv.key_to_index

    for word in words:
        if word in vocab:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])

    # Nothing to average: return the zero vector rather than dividing by zero.
    if nwords == 0:
        return featureVec

    # Divide the sum by the number of known words to get the average.
    return np.divide(featureVec, nwords)

def getAvgFeatureVecs(sents, model, num_features):
    """Build the matrix of averaged word vectors, one row per sentence.

    Each sentence in ``sents`` is passed to ``featureVecMethod`` and the
    resulting vector is stored in the row with the same position. Progress is
    reported through ``tqdm``. Returns a float32 array of shape
    ``(len(sents), num_features)``.
    """
    averaged = np.zeros((len(sents), num_features), dtype="float32")
    for row, tokens in enumerate(tqdm(sents)):
        averaged[row] = featureVecMethod(tokens, model, num_features)

    return averaged

In [14]:
# Averaged word-vector features for the training sentences (one row each).
emb_X = getAvgFeatureVecs(tr_sent, w2v, emb_dim)
# The matching call for the test sentences is currently disabled.
# emb_test_X = getAvgFeatureVecs(test_sent, w2v, emb_dim)


100%|██████████| 472972/472972 [05:30<00:00, 1431.17it/s]


In [15]:
# Hold-out fraction and seed for the train / validation split.
TEST_SIZE = 0.2
RANDOM_SEED = 42

# Split the embedded sentences and their `level` labels.
split_parts = train_test_split(
    emb_X,
    df['level'],
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)
tr_X, val_X, tr_y, val_y = split_parts

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Random forest with class-balanced weights, using all CPU cores.
# random_state is pinned to the same RANDOM_SEED as the data split so that
# re-running the notebook reproduces the same forest and the same scores.
forest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    class_weight='balanced',
    random_state=RANDOM_SEED,
)

# Last expression: fit() returns the estimator, which is shown as the cell output.
forest.fit(tr_X, tr_y)

RandomForestClassifier(class_weight='balanced', n_jobs=-1)

In [17]:
# Per-class probabilities and hard labels of the forest on the validation set.
probas = forest.predict_proba(val_X)
preds = forest.predict(val_X)

# Macro-averaged F1 of the hard predictions (shown as the cell result).
f1_score(val_y, preds, average='macro')

0.8512914017380779

In [18]:
# Relabel every prediction whose highest class probability is below 0.7 as 7,
# then cross-tabulate the true labels against the adjusted predictions.
low_confidence = np.max(probas, axis=1) < 0.7
preds[low_confidence] = 7

new_crosstab = pd.crosstab(
    val_y,
    preds,
    rownames=['real'],
    colnames=['pred'],
)
new_crosstab


pred,0,1,2,3,4,5,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,66579,25,0,0,0,1,181
1,51,26416,0,0,0,0,80
2,0,0,2,0,0,0,0
3,1,0,0,797,0,0,3
4,0,0,0,0,2,0,1
5,6,1,0,0,0,437,9
6,0,0,0,0,0,0,3


In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import optimizers, callbacks

# Early-stopping callback for the network: patience of 2 epochs, restoring
# the best weights seen when training stops.
es = callbacks.EarlyStopping(patience=2, restore_best_weights=True)


In [20]:
# Feed-forward classifier on the averaged embeddings: two ReLU layers
# (emb_dim and emb_dim // 2 units) followed by a 7-unit softmax output.
nn = Sequential()
nn.add(Dense(emb_dim, activation='relu'))
nn.add(Dense(emb_dim // 2, activation='relu'))
nn.add(Dense(7, activation='softmax'))

In [24]:
# Adam with learning rate 2e-4; sparse categorical cross-entropy as the loss.
adam = optimizers.Adam(2e-4)
nn.compile(optimizer=adam, loss='sparse_categorical_crossentropy')

# Train for at most 10 epochs with the early-stopping callback, validating
# on the hold-out split. The History object returned is the cell result.
nn.fit(
    tr_X,
    tr_y,
    epochs=10,
    validation_data=(val_X, val_y),
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<tensorflow.python.keras.callbacks.History at 0x191100f0d08>

In [25]:
# Network outputs on the validation set; the predicted label is the index of
# the largest value along axis 1.
preds = nn.predict(val_X)
# probas=forest.predict_proba(val_X)

nn_labels = np.argmax(preds, 1)
f1_score(val_y, nn_labels, average='macro')

0.8508106311948229