In [20]:
import torch.nn.functional as F
import torch
import torch.nn as nn

#torch.set_default_tensor_type('torch.cuda.LongTensor')

class Seq2seq(nn.Module):

    def __init__(self, voc_size, embedding_size, hidden_size, window_size, device):
        super().__init__()
    
        self.embedding = nn.Embedding(voc_size, embedding_size)
        self.enc_lstm = nn.LSTM(embedding_size, hidden_size, num_layers=3,batch_first = True)
        self.enc_out = nn.Linear(hidden_size, hidden_size)

        self.dec_lstm = nn.LSTM(embedding_size, hidden_size, num_layers=3, batch_first = True)
        self.dec_out = nn.Linear(hidden_size, voc_size)

        self.start_of_string_token = 76
        self.window_size = window_size
        
        self.device=self.set_device(device)

    def Encoder(self, x):
        embedded = self.embedding(x) ## View not needed
        output = embedded 
    
        output, hidden =self.enc_lstm(output)
    
    #output, _ = self.enc_lstm(output)
    #output = output[:,-1,:]
    ## You could also take the hidden representation here and make that he output
    ## But I think the important part is just that the input sequence is encoded into some representation
    ## And this representation can either be the output of the last LSTM cell or the hidden state in that LSTM cell
    #output = self.enc_out(output) ## Not necessary, can take output from lstm, or hidden output from lstm
        return hidden

    def Decoder(self, x, hidden):
        output = self.embedding(x) ## View not necessary

    #c0 = torch.zeros_like(hidden)
        
    #output, hidden = self.dec_lstm(output, (hidden, c0))
        output, hidden = self.dec_lstm(output, hidden)
        output = self.dec_out(output) ## No zeroth element
        return output, hidden
    
    def set_device(self, gpu=-1):
        if gpu!=-1 and torch.cuda.is_available():
            device=torch.device('cuda: ' + str(gpu))
        else:
            device=torch.device('cpu')
        return device

    def forward(self, x):
        x=x.to(self.device)
        enc_hidden = self.Encoder(x)

    #enc_hidden = torch.unsqueeze(enc_hidden, dim = 0)
        enc_h = enc_hidden[0]
        enc_cs = enc_hidden[1]

    ## During training we can use the gold standard inputs
    ## Create the start of string token
        start_of_string_t = torch.tensor(x.shape[0]*[self.start_of_string_token]).view(1,-1).to(self.device) #.cuda()
    
    ## Concatenate it with the real inputs
        x_start = torch.cat((start_of_string_t,x[:,:-1].t())).t().to(self.device)#.cuda()
        x,_ = self.Decoder(x_start, (enc_h, enc_cs))
        return x

    
  
  ## When we predict then we do not want to feed the sequence to the decoder
  ## But instead use the output sequentially
    def predict(self, x):
        x=x.to(self.device)
        enc_hidden = self.Encoder(x)
    #enc_hidden = torch.unsqueeze(enc_hidden, dim = 0)
        enc_h=enc_hidden[0]
        enc_c=enc_hidden[1]
        x_start = torch.tensor([self.start_of_string_token]).view(1,-1).to(self.device) #.cuda()
    
        output = self.embedding(x_start)
    #c0 = torch.zeros_like(enc_hidden)
    #output, hidden = self.dec_lstm(output, (enc_hidden, c0))
        output, hidden = self.dec_lstm(output, (enc_h, enc_c))
        output = self.dec_out(output) ## No zeroth element

        log_k = torch.argmax(output, dim = 2)
    
    ## Store all the obtained log_keys. Can be done in a more efficient way 
        hidden=enc_hidden
        all_log_k = [log_k.item()]
        for i in range(1, self.window_size):
            output = self.embedding(log_k)
            output, hidden = self.dec_lstm(output, hidden)
            output = self.dec_out(output)
            log_k = torch.argmax(output, dim = 2)
            all_log_k.append(log_k.item())
    
        return all_log_k


    def evaluate(self, x):
        x=x.to(self.device)
        
        # run sequence through encoder
        enc_hidden = self.Encoder(x)

        # save the hidden states for the first decoder LSTM cell
        enc_h=enc_hidden[0]
        enc_c=enc_hidden[1]
        
        # create the start of string tokens
        start_of_string_t = torch.tensor(x.shape[0]*[self.start_of_string_token]).view(1,-1).to(self.device)
        
        for i in range(self.window_size):
            
          # take latest output as input
          output=self.embedding(start_of_string_t)#[:,-1])
          out, (enc_h, enc_c)= self.dec_lstm(output.unsqueeze(0), (enc_h, enc_c))
        
          out=self.dec_out(out)

          start_of_string_t=torch.cat((start_of_string_t, torch.argmax(out, dim=2)[:,-1].view(-1,1)), dim=1)
        
        return start_of_string_t

In [21]:
import torch.utils.data as data
class Dataset(data.Dataset):
    def __init__(self, encoder_series):
        self.encoder_series=encoder_series
        self.num_total_seqs=len(self.encoder_series)
        
    def __len__(self):
        return self.num_total_seqs
        
    def __getitem__(self, index):
        encoder_input=self.encoder_series.iloc[index]
        
        return torch.LongTensor(encoder_input) #.cuda()
    
def dataloader_(encoder_series, batch_size):
    dataset=Dataset(encoder_series)
    
    data_loader=torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size,
                                           shuffle=False, drop_last=True)
    return data_loader

# Input

In [22]:
import pandas as pd

hdfs_train=pd.read_pickle('C:\\Users\\A373503\\Desktop\\Complete_4_models\\HDFS\\Transformer\\No_missing_windows_version\\train_normal_data_pytorch_05_17.pkl')

word2id=pd.read_pickle('C:\\Users\\A373503\\Desktop\\Complete_4_models\\HDFS\\Data\\word2id_train_normal.pkl')
#hdfs_test_anomalies=pd.read_pickle('anomalies_test_pytorch_autoencoder.pkl')
#hhfs_test_normal=pd.read_pickle('normal_test_pytorch_autoencoder.pkl')

## Below we are initializing the special tokens
word2id['<unk>']=len(word2id)+1
word2id['<sos>']=len(word2id)+1
word2id['<pad>']=len(word2id)+1

#df_train=df_train['Windows']

## Specify device (gpu if available, otherwise cpu)
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
hdfs_train

Unnamed: 0,Windows
0,"[2, 2, 2, 12, 3, 4, 3, 4, 3, 4]"
1,"[2, 2, 12, 3, 4, 3, 4, 3, 4, 5]"
2,"[2, 12, 3, 4, 3, 4, 3, 4, 5, 5]"
3,"[12, 3, 4, 3, 4, 3, 4, 5, 5, 5]"
4,"[3, 4, 3, 4, 3, 4, 5, 5, 5, 6]"
...,...
4688012,"[9, 9, 9, 10, 6, 6, 6, 7, 7, 8]"
4688013,"[2, 2, 12, 2, 3, 4, 3, 4, 3, 4]"
4688014,"[2, 12, 2, 3, 4, 3, 4, 3, 4, 5]"
4688015,"[12, 2, 3, 4, 3, 4, 3, 4, 5, 5]"


In [24]:
len(word2id)

54

In [25]:
src_pad_idx=word2id['<pad>']
trg_pad_idx=word2id['<pad>']
src_vocab_size=len(word2id)
trg_vocab_size=len(word2id)
window_size=10

In [26]:
df_test=pd.read_pickle('C:\\Users\\A373503\\Desktop\\Complete_4_models\\HDFS\\Transformer\\No_missing_windows_version\\transformer_normal_test.pkl')
df_test.head()

Unnamed: 0,Blockid,Preprocess_to_log_lines,labels,Length,EventSequence
525055,blk_5604007695029308798,[_info_dfs_datanode_dataxceiver_receiving_bloc...,0,13,"[2, 2, 11, 2, 3, 4, 3, 4, 3, 4, 5, 5, 14]"
446838,blk_-8684220662358283959,[_info_dfs_datanode_dataxceiver_receiving_bloc...,0,19,"[2, 2, 11, 2, 3, 4, 3, 4, 3, 4, 5, 5, 5, 6, 6,..."
146810,blk_1749152527273777138,[_info_dfs_datanode_dataxceiver_receiving_bloc...,0,25,"[2, 12, 2, 2, 3, 4, 3, 4, 3, 4, 5, 5, 5, 9, 9,..."
504536,blk_-7575024238572130397,[_info_dfs_datanode_dataxceiver_receiving_bloc...,0,13,"[2, 2, 11, 2, 3, 4, 3, 4, 3, 4, 5, 5, 14]"
411686,blk_-2768058268628005360,[_info_dfs_datanode_dataxceiver_receiving_bloc...,0,19,"[2, 2, 2, 12, 3, 4, 3, 4, 3, 4, 5, 5, 5, 6, 6,..."


In [27]:
hdfs_train['Windows']

0          [2, 2, 2, 12, 3, 4, 3, 4, 3, 4]
1          [2, 2, 12, 3, 4, 3, 4, 3, 4, 5]
2          [2, 12, 3, 4, 3, 4, 3, 4, 5, 5]
3          [12, 3, 4, 3, 4, 3, 4, 5, 5, 5]
4           [3, 4, 3, 4, 3, 4, 5, 5, 5, 6]
                        ...               
4688012    [9, 9, 9, 10, 6, 6, 6, 7, 7, 8]
4688013    [2, 2, 12, 2, 3, 4, 3, 4, 3, 4]
4688014    [2, 12, 2, 3, 4, 3, 4, 3, 4, 5]
4688015    [12, 2, 3, 4, 3, 4, 3, 4, 5, 5]
4688016    [2, 3, 4, 3, 4, 3, 4, 5, 5, 14]
Name: Windows, Length: 4688017, dtype: object

In [28]:
df_train=hdfs_train['Windows'].sample(frac = 0.01)
train_loader=dataloader_(df_train, 128)


#df_test=hdfs_train[3865473:]['Windows'].sample(frac = 0.01)

#test_loader=dataloader_(df_test, len(df_test))

In [29]:
import time
start_time = time.time()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
voc_size = len(word2id)
batch_size = 128
window_size = 10
#batches_in_epoch = 50
embedding_size = 128
hidden_size = 128
num_epochs = 4

seq2seq = Seq2seq(voc_size, embedding_size, hidden_size, window_size, device)
#mod=my_auto.cuda()

opt = torch.optim.SGD(seq2seq.parameters(), lr = 1e-1, momentum = 0.8)
loss_fn = nn.CrossEntropyLoss()

data_loader=dataloader_(df_train, batch_size)

for epoch in range(num_epochs):
    acc=0
    for b in data_loader:
        seq2seq.zero_grad()
        x = b
        y = x
        out = seq2seq(x) #.cuda()
        #print(out)
        #print(len(out))
        out = out.reshape(batch_size*window_size, -1)
        targets = y.reshape(batch_size*window_size)
        
        #print("predicted:")
        #print(torch.argmax(out, dim=1))
        #print("targets:")
        #print(targets)
        #print("equal:")
        accuracy=torch.sum(torch.argmax(out, dim=1)==targets)/out.size(0) #((targets==torch.argmax(out, dim=1)).sum())/len(targets)
        acc+=accuracy.item()
        loss = loss_fn(out, targets)
        
        
        loss.backward()
        opt.step()
        
    #with torch.no_grad():
    #    sum_=0
     #   samples=10
      #  for d in range(samples):
                #i=random.randint(0,len(data_not_trained_on)-1)
       #     x_predict = torch.LongTensor(df_test.iloc[d]).view(1, -1).to(device) #.cuda()
    
            #if d%(samples/10)==0:
            #    print('Predicted {} : Input {}'.format(seq2seq.predict(x_predict), x_predict.tolist()[0]))
        #    sum_+=sum([a == b for a, b in zip(seq2seq.predict(x_predict),x_predict.tolist()[0])])/len(x_predict.tolist()[0])
        #print("Validation accuracy: {}".format(sum_/samples))
    #print("Accuracy: {}".format(sum_/samples))
        
    print('Epoch {} Loss {}, Accuracy {}'.format(epoch, loss.item(), acc/len(data_loader)))
    print("---The runing time is: %s seconds ---" % (time.time() - start_time))       


IndexError: index out of range in self

# Save the model 

In [None]:
#torch.save(seq2seq.state_dict(), 'C:\\Users\A373503\\Desktop\\HDFS_04_11_deeplogmethod_finished\\seq2seq')
torch.save(seq2seq.state_dict(), 'C:\\Users\A373503\\Desktop\\HDFS_04_11_deeplogmethod_finished\\seq2seq_entire_testdata')

# Load the model

In [None]:
model = torch.load('C:\\Users\A373503\\Desktop\\HDFS_04_11_deeplogmethod_finished\\pytorch\\seq2seq_entire_testdata')


# Anomaly detection

In [None]:
import pandas as pd


In [None]:
df_test=pd.read_pickle('C:\\Users\\A373503\\Desktop\\Complete_4_models\\HDFS\\Data\\HDFS_test_data_mix_training_pytorch.pkl')
print("The size of the test data is:",len(df_test))
df_test.head()

In [None]:
df_test=df_test[:1000]

In [None]:
def transform_sequences_of_events(log_key_sequence, window_size, step):
    XX=[]
    #y=[]
    for i in range(0, len(log_key_sequence) - window_size, step):
        sentence = log_key_sequence[i:i + window_size]
        next_word = log_key_sequence[i + window_size]
        XX.append(sentence)
        #y.append(next_word)
    return XX

In [None]:


TP=0
FP=0
TN=0
FN=0
accs=list()


for s in range(len(df_test)):
    
    #print("check:",s)
    sequence=df_test.EventSequence.iloc[s]
    label=int(df_test.labels.iloc[s])
    file_input=[]
    
    file_input.extend(transform_sequences_of_events(sequence, 10, 1))
        
    file_input=pd.Series(file_input)
    file_df=pd.DataFrame(file_input, columns=['Windows'])
    file=file_df['Windows']

    
    with torch.no_grad():
        
        acc=0
        for window in range(len(file)):
            x_predict = torch.LongTensor(file.iloc[window]).view(1, -1).to(device) #.cuda()
    
            acc+=sum([a == b for a, b in zip(seq2seq.predict(x_predict),x_predict.tolist()[0])])/len(x_predict.tolist()[0])

 

        accuracy=acc/len(file)
        accs.append(accuracy)
        #print("check:",s,accuracy)
            
    
    #dataloader_file=dataloader_(file, len(file))
    
    #for k in dataloader_file:
    #    #print(k.size())
    #    preds=model.evaluate(k, word2id['<sos>'])
    #    #print(preds[:,1:].size())
    #    accuracy=torch.sum(preds[:,1:]==k)/(len(file)*10)
        #print('Accuracy {}, Label {}'.format(accuracy, label))
        
        


In [None]:
import numpy as np
import matplotlib.pyplot as plt
x=np.arange(1,len(df_test)+1)
plt.xlabel('Each file')
plt.ylabel('Accuency')
plt.plot(x,accs)

In [None]:
import time
start_time = time.time()
threshold=0.6
for i in range(len(accs)):
    accuracy=accs[i]
    if accuracy<threshold and label==1: #anomaly prediction
        TP+=1
    elif accuracy<threshold and label==0:
        FP+=1
    elif accuracy >threshold and label ==1:
        FN+=1
    else:
        TN+=1

print("Threshold is:",threshold)
print("True positive (anomaly with anomaly prediction):",TP)
print("False positive (normal with anomaly prediction): ",FP)

print("False negative (anomaly with normal prediction):",FN)
print("True negative (normal with normal prediction):",TN)
            
accuracy=(TP+TN)/(TP+TN+FP+FN)
precision=TP/(TP+FP)
recall=TP/(TP+FN)
f1_score=(2*recall*precision)/(recall+precision)

print("Accuracy:",accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1 score:",f1_score)
print("\n")
print("---The runing time is: %s seconds ---" % (time.time() - start_time))         
