In [1]:
import numpy as np
import pandas as pd
import re

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
SEED = 42
# Seed every random number generator this notebook imports so reruns are comparable.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE: Python reads PYTHONHASHSEED once at interpreter start-up, so setting it here
# does not change hashing in this kernel; it is only inherited by child processes.
os.environ['PYTHONHASHSEED'] = str(SEED)

In [4]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


In [5]:
# Load the pickled training and test frames.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
train, test = map(pd.read_pickle, ('./data/train.pkl', './data/test.pkl'))

# Displayed as the cell result: (rows, columns) of each frame.
(train.shape, test.shape)

((472972, 2), (1418916, 1))

In [7]:
# Ordered regex -> replacement rules used by convert() and convert_df().
# Order matters: the strict HH:MM:SS rule must run before the looser colon rules,
# otherwise timestamps would already have been rewritten as <RANGE>.
expressions = dict([
    # exactly two digits, colon, two digits, colon, two digits
    (r'\d{2}:\d{2}:\d{2}', '<TIME>'),
    # any other three colon-separated numbers
    (r'\d+:\d+:\d+', '<RANGE>'),
    # two colon-separated numbers
    (r'\d+:\d+', '<RANGE>'),
    # three dash-separated numbers plus the single non-space character that follows
    (r'\d+-\d+-\d+\S', '<DAY>'),
    # capital + two lowercase letters, a space, then (space-or-digit) and a digit
    (r'[A-Z][a-z]{2} [\s\d]\d', '<MON> <DATE>'),
    # a literal backslash followed by "n", a comma, or a square bracket -> single space
    (r'\\n|[,]|[[]|[]]', ' '),
    # braces, parentheses, double quotes and runs of backslashes are dropped
    (r'[{]|[}]|[(]|[)]|["]|[\\]+', ''),
])

def strip_strs(x):
    """Return ``x`` with the spaces removed from every double-quoted phrase.

    A phrase is one or more ``"`` characters, then the shortest run of at
    least one character (newlines included), then a closing ``"``.  Each
    phrase found is rewritten wherever it occurs in the string, so a quoted
    phrase ends up with no spaces inside it.
    """
    result = x
    for quoted in re.findall(r'"+[\S\s]+?"', x):
        squeezed = quoted.replace(' ', '')
        result = result.replace(quoted, squeezed)
    return result

def convert(x):
    """Apply every rule in ``expressions`` to the string ``x``, in dict order.

    Each key is used as a regular expression and every match is replaced by
    the associated value; the output of one rule is the input of the next.
    """
    result = x
    for pattern in expressions:
        result = re.sub(pattern, expressions[pattern], result)
    return result

def convert_df(df_, col='full_log'):
    """Return a copy of ``df_`` with the text in ``col`` normalised to placeholder tokens.

    Steps, in order:
      1. ``strip_strs`` removes spaces inside double-quoted phrases.
      2. ``audit(...)`` headers, http(s) URLs and dotted-quad addresses are replaced.
      3. Every rule in the module-level ``expressions`` dict is applied in dict order.
      4. Clean-up rules separate placeholders from neighbouring text, turn
         space-delimited numbers into ``<NUM>`` and collapse double spaces.

    Args:
        df_: DataFrame holding the raw text; it is not modified.
        col: name of the string column to normalise.

    Returns:
        A new DataFrame identical to ``df_`` except for the rewritten ``col``.
    """
    # Rules that must run before the shared ``expressions`` table.
    leading_rules = [
        (r'audit\(\w+\.\w+:\w+\):', '<audit_LISTEN>'),
        (r'http[s]?[:\/\/]+[\d|.|\S]+[/]', '<URL>'),
        # Dotted quad with an optional ":port".  The earlier pattern ended in the
        # character class ``[:\d+]?``, which swallowed only a single character and
        # left the port digits glued to the placeholder (e.g. "<URL>8080").
        (r'\d+\.\d+\.\d+\.\d+(?::\d*)?', '<URL>'),
    ]
    # Rules that tidy up after the placeholders have been inserted.
    trailing_rules = [
        ('<DAY><TIME>', '<DAY> <TIME>'),
        (r'\S<DAY>', ' <DAY>'),
        # force a space after every '>' and drop one digit directly behind it
        (r'>\d?', '> '),
        (r' \d+ ', ' <NUM> '),
        ('  ', ' '),
    ]

    df = df_.copy()
    text = df[col].map(strip_strs)
    for pattern, repl in [*leading_rules, *expressions.items(), *trailing_rules]:
        # regex=True is explicit: pandas >= 2.0 defaults to literal matching,
        # which would silently turn every rule above into a no-op.
        text = text.str.replace(pattern, repl, regex=True)
    df[col] = text
    return df

# re.findall(r'[a-z]+_u:[a-z]+_r:\w+:\w+', test_X['full_log'].values[41])

In [8]:
%%time
# Normalise the log text of both frames; convert_df returns copies, so `train` and
# `test` keep their raw text.  The second call relies on the default col='full_log'.
df = convert_df(train, 'full_log')
test_X = convert_df(test)
# df

Wall time: 3min 44s


In [None]:
# Optional 80/20 hold-out split, stratified on the target column.
# NOTE(review): the following cell rebinds tr_X / tr_y to the full data set, so when
# the cells run top to bottom only val_X / val_y survive from this split, and those
# rows are then also part of the training data.
tr_X, val_X, tr_y, val_y = train_test_split(df['full_log'], df['level'], 
                                            test_size=0.2, 
                                            random_state=SEED,
                                           stratify=df['level'].values)


In [9]:
# Train on every labelled row (no hold-out): text and target come straight from `df`.
tr_X = df['full_log']
tr_y = df['level']

# `test_X` changes from a DataFrame to its text column here.
# NOTE(review): re-running this cell alone would index the Series instead of the frame.
test_X = test_X['full_log']

In [13]:
%%time
MIN_WORDS = 50  # a token is kept only if it occurs more than this many times in tr_X

# Count token frequencies over the training text.  filters='' disables the
# tokenizer's punctuation stripping, so tokens are the space-separated pieces.
tok = tf.keras.preprocessing.text.Tokenizer(filters='')
tok.fit_on_texts(tr_X)
counter = tok.word_counts


def keep_frequent_tokens(text, counts, min_count):
    """Lower-case ``text``, split on single spaces and keep only frequent tokens.

    A token survives when it is longer than one character, is a key of
    ``counts`` and its count is strictly greater than ``min_count``.  The
    surviving tokens are joined back with single spaces.
    """
    kept = [
        token
        for token in text.lower().split(' ')
        if len(token) > 1 and token in counts and counts[token] > min_count
    ]
    return ' '.join(kept)


# Rebind to new Series instead of assigning through ``.iloc[:]``, which also
# rewrote the columns of the frames these Series were taken from.
tr_X = tr_X.map(lambda text: keep_frequent_tokens(text, counter, MIN_WORDS))
# val_X = val_X.map(lambda text: keep_frequent_tokens(text, counter, MIN_WORDS))
test_X = test_X.map(lambda text: keep_frequent_tokens(text, counter, MIN_WORDS))


Wall time: 1min 29s


In [14]:
import gc

# Force a full garbage-collection pass; the value shown as the cell result is the
# number of unreachable objects the collector found.
gc.collect()

20

In [15]:
# Rebind `tok` to a fresh Tokenizer fitted on tr_X as it stands now, i.e. after the
# short/rare-token filtering, so the vocabulary only contains tokens that were kept.
# filters='' keeps punctuation inside tokens.
tok = tf.keras.preprocessing.text.Tokenizer(filters='')
tok.fit_on_texts(tr_X)

In [18]:
maxlen=100

# One set of padding options for every split: sequences are cut to / padded up to
# `maxlen` ids, with both truncation and padding applied at the end.
pad_kwargs = dict(maxlen=maxlen, padding='post', truncating='post')

# Token strings -> lists of integer ids from the fitted tokenizer.
x_train = tok.texts_to_sequences(tr_X)
# x_val = tok.texts_to_sequences(val_X)
x_test = tok.texts_to_sequences(test_X)

x_train_vector = tf.keras.preprocessing.sequence.pad_sequences(x_train, **pad_kwargs)
# x_val_vector = tf.keras.preprocessing.sequence.pad_sequences(x_val, **pad_kwargs)
x_test_vector = tf.keras.preprocessing.sequence.pad_sequences(x_test, **pad_kwargs)

In [19]:
from tensorflow import keras
import tensorflow_addons as tfa
from tensorflow.keras import layers
from tensorflow_addons.layers import MultiHeadAttention

class TokenAndPositionEmbedding(layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: number of rows in the position table; the last axis of the
            input must not be longer than this.
        vocab_size: number of rows in the token table.
        embed_dim: width of both embeddings.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can rebuild the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed ``x`` and add the embedding of each position along its last axis.

        Assumes ``x`` holds integer token ids shaped (batch, seq_len) -- TODO confirm.
        """
        # Use the runtime length of the last axis rather than the constructor value.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and cloned."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config
    
    
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward net, each with dropout,
    a residual connection and layer normalisation.

    Args:
        embed_dim: width of the block's input and output.
        num_heads: number of attention heads.
        ff_dim: width of the hidden feed-forward layer.
        rate: dropout rate used after attention and after the feed-forward net.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        # NOTE(review): head_size is embed_dim, not embed_dim // num_heads -- confirm
        # that a full-width projection per head is intended.
        self.att = MultiHeadAttention(num_heads=num_heads, head_size=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        ``training`` is handed to both dropout layers; it defaults to ``None`` so
        the block can also be called directly without the flag.
        """
        # The same tensor is passed as both elements of the attention input list.
        attn_output = self.att([inputs, inputs])
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and cloned."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

In [20]:
# maxlen = 150
# --- hyper-parameters ---
emb_dim = 256                          # width of the token/position embeddings
vocab_size = len(tok.word_index)+1     # one more than the number of tokenizer entries
num_heads = 8                          # attention heads per block
ff_dim = 512                           # hidden width of each block's feed-forward net
num_blocks = 4                         # number of stacked transformer blocks

# --- layers with trainable state (created before the graph is wired) ---
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, emb_dim)
transformer_blocks = [
    TransformerBlock(emb_dim, num_heads, ff_dim) for _ in range(num_blocks)]

# --- graph: ids -> embeddings -> blocks -> pooled classifier head ---
inputs = layers.Input(shape=(maxlen,))

x = embedding_layer(inputs)
for block in transformer_blocks:
    x = block(x)

# Average over the sequence axis, then a small dense head with dropout.
classifier_head = [
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.1),
]
for head_layer in classifier_head:
    x = head_layer(x)
# Seven softmax outputs, matching the 7-way one-hot targets used for training.
outputs = layers.Dense(7, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 256)          483584    
_________________________________________________________________
transformer_block (Transform (None, 100, 256)          2361344   
_________________________________________________________________
transformer_block_1 (Transfo (None, 100, 256)          2361344   
_________________________________________________________________
global_average_pooling1d (Gl (None, 256)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)               

In [21]:
# Inverse-frequency weight per level: total_rows / (2 * rows_with_that_level).
# NOTE(review): the `class_weight=` argument of model.fit is commented out in the
# training cell, so these weights are only displayed here.
level_counts = Counter(train['level'].values)
s = sum(level_counts.values())
class_weight = Counter({level: 1/n_rows * s/2 for level, n_rows in level_counts.items()})
class_weight

Counter({0: 0.7079041503898942,
         1: 1.7845710361689444,
         3: 57.10842791596233,
         5: 106.57323118521857,
         2: 19707.166666666664,
         4: 23648.600000000002,
         6: 29560.75})

In [22]:
import math

class CosineAnnealingLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Cosine learning-rate schedule that restarts every ``t_batch * n_cycle`` steps.

    Within one period the rate falls from ``lr_max`` at the first step towards
    ``lr_min`` along half a cosine wave, then jumps back to ``lr_max``.

    Args:
        t_batch: multiplied with ``n_cycle`` to give the period in optimizer steps
            (the notebook passes the number of batches per epoch).
        lr_max: learning rate at the start of each period.
        lr_min: lower bound approached at the end of each period.
        n_cycle: multiplied with ``t_batch`` to give the period.
    """

    def __init__(self, t_batch, lr_max, lr_min, n_cycle):
        self.t_batch = t_batch
        self.lr_max = lr_max
        self.lr_min = lr_min
        self.n_cycle = n_cycle

    def __call__(self, step):
        # Cast first: the arithmetic below mixes the step with float constants and
        # fails if the optimizer hands over an integer tensor.
        step = tf.cast(step, tf.float32)
        period = float(self.t_batch * self.n_cycle)
        # Position inside the current period, mapped onto [0, pi).
        cos_inner = (math.pi * (step % period)) / period

        return (self.lr_max - self.lr_min)/2 * (tf.math.cos(cos_inner) + 1) + self.lr_min

    def get_config(self):
        """Return the constructor arguments so an optimizer using this schedule can be serialised."""
        return {
            "t_batch": self.t_batch,
            "lr_max": self.lr_max,
            "lr_min": self.lr_min,
            "n_cycle": self.n_cycle,
        }
    
max_rate = 1e-5   # learning rate at the start of each cosine period
min_rate = 1e-6   # learning rate approached at the end of each period
cycle = 5         # period length, in multiples of steps_per_epoch
batch_size = 128

# Batches per epoch, counting a final partial batch.  math.ceil avoids the extra
# step that `len // batch_size + 1` adds when the data divides evenly into batches.
steps_per_epoch = math.ceil(len(x_train_vector) / batch_size)

lr = CosineAnnealingLearningRateSchedule(steps_per_epoch, max_rate, min_rate, cycle)


In [23]:
# Early stopping on the validation macro-F1 (higher is better): stop after 5 epochs
# without improvement and restore the best weights.
# NOTE(review): the training cell has both `validation_data` and `callbacks=[es]`
# commented out, so this callback is currently unused; 'val_f1_score' only exists
# when validation data is supplied.
es = keras.callbacks.EarlyStopping(patience=5, 
                                   restore_best_weights=True, 
                                   monitor='val_f1_score',
                                  mode='max')


In [29]:
# Adam driven by the cosine schedule; accuracy and macro-averaged F1 over the 7 classes.
model.compile(
    optimizer=tf.optimizers.Adam(lr),
    loss="categorical_crossentropy",
    metrics=["accuracy", tfa.metrics.F1Score(num_classes=7, average='macro')],
)

# categorical_crossentropy expects one-hot targets, so expand the integer levels.
y_train_onehot = keras.utils.to_categorical(tr_y, 7)

# Disabled options, kept for reference:
#   validation_data=(x_val_vector, keras.utils.to_categorical(val_y, 7))
#   callbacks=[es]
#   class_weight=class_weight
history = model.fit(
    x=x_train_vector,
    y=y_train_onehot,
    epochs=5,
    batch_size=batch_size,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Predict on the hold-out split.
# NOTE(review): no active code in the visible notebook assigns x_val_vector, so this
# cell raises NameError on a top-to-bottom run until the validation path is restored.
pred = model.predict(x_val_vector)

In [None]:
# Macro-averaged F1 of the arg-max class against the hold-out labels.
# NOTE(review): val_y is only assigned by the train_test_split cell, and `pred` must
# come from the validation predict cell, not the test one.
f1_score(val_y, np.argmax(pred, 1), average='macro')

In [30]:
# Model outputs for every padded test sequence, predicted in batches of 128.
pred = model.predict(x_test_vector, batch_size=128)
# Index of the largest output in each row.
result = np.argmax(pred, 1)

In [31]:
# Rows whose highest probability is below 0.7 are relabelled 7, a value outside the
# model's own 0-6 outputs.
# NOTE(review): presumably 7 marks "unknown / none of the trained levels" -- confirm.
top_prob = np.max(pred, axis=1)
result = np.argmax(pred, axis=1)
result[top_prob < 0.7] = 7
np.unique(result)

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [32]:
# Number of test rows whose highest predicted probability is below the 0.7 cut-off.
# np.count_nonzero tallies inside NumPy; the built-in sum() walked the array one
# element at a time in Python.
np.count_nonzero(np.max(pred, 1) < 0.7)

3107

In [33]:
# Fill the sample-submission template (indexed by `id`) with the predicted levels.
# NOTE(review): the assignment is positional, so this assumes the template rows are
# in the same order as the rows of the test frame -- confirm.
sub = pd.read_csv('./data/sample_submission.csv', index_col='id')
sub['level'] = result
sub

Unnamed: 0_level_0,level
id,Unnamed: 1_level_1
1000000,0
1000001,0
1000002,1
1000003,0
1000004,1
...,...
2418911,0
2418912,0
2418913,1
2418914,0


In [34]:
# Write the submission next to the notebook; the `id` index is written as the first column.
sub.to_csv('./transformer_2_0.7.csv')