In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchtext

from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from torch.nn.functional import binary_cross_entropy_with_logits, binary_cross_entropy
from torchmetrics import Accuracy, F1Score
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
from sklearn import model_selection

In [3]:
import os

In [4]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [5]:
from dotenv import load_dotenv
load_dotenv()

os.chdir(os.getenv("PROJECT_ROOT_DIR"))
print(os.getcwd())

/root/malicious-code-detection


In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:
    VOCAB_SIZE = 0
    BATCH_SIZE = 2
    EMB_SIZE = 100
    OUT_SIZE = 2
    NUM_FOLDS = 5
    NUM_EPOCHS = 10
    NUM_WORKERS = 8
    # Whether to update the pretrained embedding weights during training process
    EMB_WT_UPDATE = True
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy
    FAST_DEV_RUN = False    
    PATIENCE = 6    
    IS_BIDIRECTIONAL = True
    # model hyperparameters
    MODEL_PARAMS = {
        "hidden_size": 141, 
        "num_layers": 2,         
        "drop_out": 0.4258,
        "lr": 0.000366,
        "weight_decay": 0.00001
    }
    X_TEST_PATH = 'data/exp/test_set_token_types_corpus.txt'
    Y_TEST_PATH = 'data/exp/test_set_labels.txt'
    X_TRAIN_PATH = 'data/exp/train_set_token_types_corpus.txt'
    Y_TRAIN_PATH = 'data/exp/train_set_labels.txt' 
    
# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(42, workers=True)

Global seed set to 42


42

In [8]:
def read_data(path):
	with open(path, "r") as f:
		data = f.readlines()
		vectors = [token.split() for token in data]
		return vectors

In [9]:
X_train = read_data(Config.X_TRAIN_PATH)
y_train_str = np.loadtxt(Config.Y_TRAIN_PATH, dtype=str)
y_train = np.where(y_train_str == 'goodjs', 0.0, 1.0)

X_test = read_data(Config.X_TEST_PATH)
y_test_str = np.loadtxt(Config.Y_TEST_PATH, dtype=str)
y_test = np.where(y_test_str == 'goodjs', 0.0, 1.0)

df_train = pd.DataFrame({'X': X_train, 'y': y_train})
df_train = df_train[df_train["X"].apply(len) != 0]
df_test = pd.DataFrame({'X': X_test, 'y': y_test})
df_test = df_test[df_test["X"].apply(len) != 0]

In [10]:
# take a sample of df_train and df_test only, stratify on y

df_train = df_train.sample(n=80, random_state=42)
df_test = df_test.sample(n=20, random_state=42)

In [11]:
unique_vocabs = set()
for x in X_train:
	unique_vocabs.update(x)

print(f"Unique vocab size: {len(unique_vocabs)}")

Unique vocab size: 9


In [12]:
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold
    return df   

In [13]:
df_train = strat_kfold_dataframe(df_train, target_col_name="y", num_folds=5)

In [14]:
FASTTEXT_EMB_FILE = "models/word_rep_by_types.vec"
emb = torchtext.vocab.Vectors(name=FASTTEXT_EMB_FILE, cache="./vector_cache")

In [15]:
def yield_tokens(df):
    for index, row in df.iterrows():
        yield row["X"]
    
ast_vocab = build_vocab_from_iterator(yield_tokens(df_train), specials=["<unk>", "<pad>"])   
Config.VOCAB_SIZE = len(ast_vocab)

In [16]:
def get_vocab_pt_emb_matrix(text_vocab, emb):
    embedding_matrix = []
    for token in text_vocab.get_itos():
        embedding_matrix.append(emb.get_vecs_by_tokens(token))
    return torch.stack(embedding_matrix)

pt_emb_weights = get_vocab_pt_emb_matrix(ast_vocab, emb)
pt_emb_layer = nn.Embedding.from_pretrained(pt_emb_weights)

In [17]:
df_train["vectorized_X"] = df_train["X"].apply(
    lambda row:torch.LongTensor(ast_vocab.lookup_indices(row))
    )

In [18]:
class JavaScriptASTDataset(Dataset):
    def __init__(self, ast_vecs, labels):
        self.ast_vecs = ast_vecs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        ast_vec = self.ast_vecs[idx]
        label = self.labels[idx]
        # ast_len = len(ast_vec)
        return (ast_vec, label)


In [19]:
def pad_collate(batch):
    # Each element in the batch is a tuple (data, label)
    # sort the batch (based on tweet word count) in descending order
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    sequences = [x[0] for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    # Also need to store the length of each sequence.This is later needed in order to unpad 
    # the sequences
    seq_len = torch.Tensor([len(x) for x in sequences])
    labels = torch.Tensor([x[1] for x in sorted_batch])
    return sequences_padded, seq_len, labels

def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    X_train = train_df["vectorized_X"].to_numpy()
    y_train = train_df["y"].to_numpy()
    X_valid = valid_df["vectorized_X"].to_numpy()
    y_valid = valid_df["y"].to_numpy()
    ds_train = JavaScriptASTDataset(X_train, y_train)
    ds_valid = JavaScriptASTDataset(X_valid, y_valid)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    dl_valid = DataLoader(ds_valid, batch_size=Config.BATCH_SIZE, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    return dl_train, dl_valid

In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class DisasterModel(nn.Module):
    """The RNN model."""
    def __init__(self, vocab_size, num_layers, is_bidirect, emb_size, hidden_size, output_size, 
                pt_emb_weights, emb_wt_update=False, drop_prob=0.5):
        super().__init__()        
        self.vocab_size = vocab_size
        self.num_layers = num_layers        
        # size of the embedding vector
        self.emb_size = emb_size
        self.hidden_size = hidden_size   
        self.output_dim = output_size
        self.is_bidirect = is_bidirect
        # Embedding layer
        self.emb_layer = nn.Embedding(self.vocab_size, emb_size)
        # copy the vocab specific weights(emb vectors) from pretrained embeddings to model embedding layer
        self.emb_layer.weight.data.copy_(pt_emb_weights)    
        # whether to update the pretrained embedding layer weights during model training
        self.emb_layer.weight.requires_grad = emb_wt_update            
        # LSTM Layer        
        self.lstm_layer = nn.LSTM(
                        input_size=emb_size, 
                        hidden_size=hidden_size, 
                        batch_first=True, 
                        bidirectional=is_bidirect, 
                        num_layers=num_layers, 
                        dropout=drop_prob
                        )
        self.dropout = nn.Dropout(p = drop_prob)                        
        
        # If the RNN is bidirectional `num_directions` should be 2, else it should be 1.        
        if not is_bidirect:
            self.num_directions = 1
            self.linear = nn.Linear(self.hidden_size, self.output_dim)
        else:       
            self.num_directions = 2     
            self.linear = nn.Linear(self.hidden_size * self.num_directions, self.output_dim)
        # The activation layer which converts output to 0 or 1            
        self.act = nn.Sigmoid()            

    def forward(self, inputs, input_lengths, state):        
        # inputs = [batch_size, batch_max_seq_length]        
        # embeds is of shape batch_size * num_steps * emb_dim and is the input to lstm layer
        embeds = self.emb_layer(inputs)        
        batch_size = inputs.shape[0]        
        # embeds = [batch_size, max_seq_length, emb_dim]
        # pack_padded_sequence before feeding into LSTM. This is required so pytorch knows
        # which elements of the sequence are padded ones and ignore them in computation.
        # This step is done only after the embedding step
        embeds_pack = pack_padded_sequence(embeds, input_lengths.to("cpu"), batch_first=True)                
        lstm_out_pack, (h_n, c_n) = self.lstm_layer(embeds_pack)
        # h_n and c_n = [num_directions * num_layers, batch_size, hidden_size]
        # unpack the output
        lstm_out, lstm_out_len = pad_packed_sequence(lstm_out_pack, batch_first=True)        
        #print(f"lstm_out.shape = {lstm_out.shape}")
        #print(f"lstm_out_len.shape = {lstm_out_len.shape}")
        # lstm_out = [batch_size, max_seq_length, hidden_size * num_directions]
        if self.is_bidirect:            
            # each batch item has different seq length, so to select the hidden state at t_end for each batch item
            # a for comprehension like below is needed, a vectorized operation doesn't seem plausible
            #lstm_out = [lstm_out[batch_item_index, seq_length_index-1, :] for batch_item_index, seq_length_index in enumerate(lstm_out_len)]            
            #lstm_out = torch.cat(lstm_out, dim=0).reshape(batch_size, 4 * self.hidden_size)
            #print(f"lstm_out.shape = {lstm_out.shape}")
            # Another way to extract the last hidden state for the forward and backward lstm layers
            # in a BiRNN is to use h_n like this
            h_tend_fwd = h_n[-2, :, :]
            h_tend_bwd = h_n[-1, :, :]
            lstm_out = torch.cat((h_tend_fwd, h_tend_bwd), dim=1)
            #print(f"lstm_out.shape = {lstm_out.shape}")
        else:                        
            lstm_out = h_n[-1, :, :]                    
        
        out = self.dropout(lstm_out)                
        output = self.linear(out)        
        # apply sigmoid activation to convert output to probability 
        output = self.act(output)
        # [batch_size, 2]
        return output

    def init_state(self, batch_size=1):
        """ Initialize the hidden state i.e. initialize all the neurons in all the hidden layers 
        to zero"""
        if not isinstance(self.lstm_layer, nn.LSTM):
            # `nn.GRU` takes a tensor as hidden state
            return torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size))
        else:
            # `nn.LSTM` takes a tuple of hidden states (h0, c0). h0 = initial
            # hidden state for each element in the batch, c0 = initial cell state
            # for each element in the batch
            return (torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size)),
                    torch.zeros((self.num_directions * self.num_layers,batch_size, self.hidden_size)))

In [21]:
class DisasterTweetLitModel(pl.LightningModule):
    def __init__(self, vocab_size, emb_size, output_size, pt_emb_weights, emb_wt_update, 
                hparams, model_eval_metric=MODEL_EVAL_METRIC.accuracy):
        super().__init__()
        #self.save_hyperparameters()
        self.lr = hparams["lr"]
        self.weight_decay = hparams["weight_decay"]
        self.model_eval_metric = model_eval_metric
        self.network = DisasterModel(
            vocab_size = vocab_size,
            num_layers = hparams["num_layers"],
            is_bidirect = Config.IS_BIDIRECTIONAL,
            emb_size = emb_size,
            hidden_size = hparams["hidden_size"],
            output_size = output_size,
            pt_emb_weights = pt_emb_weights,
            emb_wt_update = emb_wt_update,
            drop_prob = hparams["drop_out"]
        )

    def forward(self, tweets, tweet_lengths, state):
        return self.network(tweets, tweet_lengths, state)

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, mode="min")
        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "monitor": "val_loss",
                "frequency": 1
            }
        }

    def training_step(self, batch, batch_idx):
        tweets, tweet_lengths, targets = batch
        # initialize the hidden and cell state of the LSTM
        h0, c0 = self.network.init_state()
        targets_pred = self(tweets, tweet_lengths, (h0, c0))        
        #print(f"targets_pred.shape = {targets_pred.shape}")
        loss_targets = F.one_hot(targets.T.long(), num_classes=2)
        loss_targets = loss_targets.float()        
        train_loss = binary_cross_entropy(targets_pred, loss_targets)
        train_metric = None
        train_metric_str = ""
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:            
            targets_pred = torch.argmax(targets_pred, dim=1)            
            train_metric = Accuracy(task="binary", num_classes=2)(targets_pred.cpu(), targets.long().cpu())
            train_metric_str = "train_acc"
        elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
            train_metric = F1(targets_pred, targets)            
            train_metric_str = "train_f1"
        self.log("train_loss", train_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(train_metric_str, train_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        tweets, tweet_lengths, targets = batch
        # initialize the hidden and cell state of the LSTM
        h0, c0 = self.network.init_state()
        targets_pred = self(tweets, tweet_lengths, (h0, c0))
        loss_targets = F.one_hot(targets.T.long(), num_classes=2)
        loss_targets = loss_targets.float()        
        val_loss = binary_cross_entropy(targets_pred, loss_targets)
        val_metric = None
        val_metric_str = ""
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:
            targets_pred = torch.argmax(targets_pred, dim=1)
            val_metric = Accuracy(task="binary", num_classes=2)(targets_pred.cpu(), targets.long().cpu())
            val_metric_str = "val_acc"
        elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
            val_metric = F1(targets_pred, targets)            
            val_metric_str = "val_f1"
        self.log("val_loss", val_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(val_metric_str, val_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return val_loss

In [22]:
from pytorch_lightning.callbacks import Callback
from pytorch_lightning import LightningModule, Trainer
# Monitor multiple metric values that are calculated either in training or validation step and return the
# best metric values for each epoch
class MetricsAggCallback(Callback):
    def __init__(self, train_metrics_to_monitor, val_metrics_to_monitor):
        # dictionary with metric name as key and monitor mode (min, max) as the value
        # ( the same names used to log metric values in training and validation step)
        self.val_metrics_to_monitor = val_metrics_to_monitor
        self.train_metrics_to_monitor = train_metrics_to_monitor
        # dictionary with metric_name as key and list of metric value for each epoch
        self.train_metrics = {metric: [] for metric in train_metrics_to_monitor.keys()}
        self.val_metrics = {metric: [] for metric in val_metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the best metric value for all epochs
        self.train_best_metric = {metric: None for metric in train_metrics_to_monitor.keys()}
        self.val_best_metric = {metric: None for metric in val_metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the epoch number with the best metric value
        self.train_best_metric_epoch = {metric: None for metric in train_metrics_to_monitor.keys()}     
        self.val_best_metric_epoch = {metric: None for metric in val_metrics_to_monitor.keys()}     
        self.epoch_counter = 0           

    @staticmethod
    def process_metrics(metrics_to_monitor, metrics, best_metric, best_metric_epoch, trainer):
        metric_str = ""
        for metric, mode in metrics_to_monitor.items():
            metric_value = round(trainer.callback_metrics[metric].cpu().detach().item(), 4)            
            metric_str += f"{metric} = {metric_value}, "
            metrics[metric].append(metric_value)
            if mode == "max":
                best_metric[metric] = max(metrics[metric])            
            elif mode == "min":            
                best_metric[metric] = min(metrics[metric])            
            best_metric_epoch[metric] = metrics[metric].index(best_metric[metric]) 
        print(metric_str[:-2])

    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        self.epoch_counter += 1        
        self.process_metrics(self.train_metrics_to_monitor, self.train_metrics, self.train_best_metric, self.train_best_metric_epoch, trainer)

    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):        
        print(f"For epoch {self.epoch_counter}")
        self.process_metrics(self.val_metrics_to_monitor, self.val_metrics, self.val_best_metric, self.val_best_metric_epoch, trainer)

In [23]:
def run_training(fold, dl_train, dl_val, pt_emb_weights, find_lr=True):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    disaster_tweet_model = DisasterTweetLitModel(
        vocab_size=Config.VOCAB_SIZE,
        emb_size=Config.EMB_SIZE,
        output_size=Config.OUT_SIZE,
        pt_emb_weights=pt_emb_weights,
        emb_wt_update=Config.EMB_WT_UPDATE,
        hparams=Config.MODEL_PARAMS,
        model_eval_metric=Config.MODEL_EVAL_METRIC                
        )
    tb_logger = pl.loggers.TensorBoardLogger(save_dir="logs")    
    chkpt_file_name = fold_str + "_best_model_{epoch}_{val_loss:.4f}"
    train_metrics_to_monitor = {
        "train_loss": "min",
        "train_acc": "max"
    }
    val_metrics_to_monitor = {
        "val_loss": "min",
        "val_acc": "max",
        }
    loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)    
    metric_chkpt_callback = MetricsAggCallback(train_metrics_to_monitor, val_metrics_to_monitor)
    early_stopping_callback = EarlyStopping(monitor="val_loss", patience=Config.PATIENCE, mode="min", verbose=True)
    trainer = pl.Trainer(
        # gpus = 1,
        accelerator="auto",
        deterministic = True,
        # auto_select_gpus = True,
        # progress_bar_refresh_rate = 20,
        max_epochs = Config.NUM_EPOCHS,
        logger = tb_logger,
        # auto_lr_find = True,    
        #precision = Config.PRECISION,   
        fast_dev_run = Config.FAST_DEV_RUN, 
        gradient_clip_val = 1.0,        
        callbacks = [loss_chkpt_callback, metric_chkpt_callback, early_stopping_callback]
    )        
    if find_lr:
        trainer.tune(model=disaster_tweet_model, train_dataloaders=dl_train)
        print(disaster_tweet_model.lr)
    trainer.fit(disaster_tweet_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    fold_train_metrics = {
        metric: (metric_chkpt_callback.train_best_metric[metric], metric_chkpt_callback.train_best_metric_epoch[metric]) 
        for metric in train_metrics_to_monitor.keys()
    }
    fold_val_metrics = {
        metric: (metric_chkpt_callback.val_best_metric[metric], metric_chkpt_callback.val_best_metric_epoch[metric]) 
        for metric in val_metrics_to_monitor.keys()
    }            
    best_model = loss_chkpt_callback.best_model_path
    del trainer, disaster_tweet_model, loss_chkpt_callback, metric_chkpt_callback 
    return fold_train_metrics, fold_val_metrics, best_model

In [24]:
find_lr = True
all_fold_val_loss = []
all_fold_val_acc = []

for fold in range(Config.NUM_FOLDS):
    dl_train, dl_val = get_fold_dls(fold, df_train)
    fold_train_metrics, fold_val_metrics, chkpt_file_name = run_training(fold, dl_train, dl_val, pt_emb_weights, find_lr=False)    
    all_fold_val_loss.append((fold_val_metrics["val_loss"][0], chkpt_file_name))
    all_fold_val_acc.append(fold_val_metrics["val_acc"][0])
    print(f"Best train metrics values for fold{fold}")    
    print(fold_train_metrics)
    print(f"Best val metrics values for fold{fold}")    
    print(fold_val_metrics)        

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Running training for fold0


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type          | Params
------------------------------------------
0 | network | DisasterModel | 755 K 
------------------------------------------
755 K     Trainable params
0         Non-trainable params
755 K     Total params
3.021     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  loss_targets = F.one_hot(targets.T.long(), num_classes=2)


For epoch 0
val_loss = 0.6946, val_acc = 0.5


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.689
Epoch 0, global step 32: 'val_loss' reached 0.68894 (best 0.68894), saving model to '/root/malicious-code-detection/model/fold0_best_model_epoch=0_val_loss=0.6889.ckpt' as top 1


For epoch 0
val_loss = 0.6889, val_acc = 0.5625
train_loss = 0.694, train_acc = 0.4844


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.688
Epoch 1, global step 64: 'val_loss' reached 0.68819 (best 0.68819), saving model to '/root/malicious-code-detection/model/fold0_best_model_epoch=1_val_loss=0.6882.ckpt' as top 1


For epoch 1
val_loss = 0.6882, val_acc = 0.5625
train_loss = 0.6845, train_acc = 0.6094


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 0.685
Epoch 2, global step 96: 'val_loss' reached 0.68546 (best 0.68546), saving model to '/root/malicious-code-detection/model/fold0_best_model_epoch=2_val_loss=0.6855.ckpt' as top 1


For epoch 2
val_loss = 0.6855, val_acc = 0.5625
train_loss = 0.6738, train_acc = 0.5469


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 128: 'val_loss' was not in top 1


For epoch 3
val_loss = 0.6942, val_acc = 0.5625
train_loss = 0.6696, train_acc = 0.5781


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 160: 'val_loss' was not in top 1


For epoch 4
val_loss = 0.7168, val_acc = 0.5625
train_loss = 0.6652, train_acc = 0.625


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 192: 'val_loss' was not in top 1


For epoch 5
val_loss = 0.7581, val_acc = 0.4375
train_loss = 0.6505, train_acc = 0.6562


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 224: 'val_loss' was not in top 1


For epoch 6
val_loss = 0.7054, val_acc = 0.5
train_loss = 0.6444, train_acc = 0.6562


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 256: 'val_loss' was not in top 1


For epoch 7
val_loss = 0.7519, val_acc = 0.4375
train_loss = 0.6558, train_acc = 0.6406


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.027 >= min_delta = 0.0. New best score: 0.658
Epoch 8, global step 288: 'val_loss' reached 0.65829 (best 0.65829), saving model to '/root/malicious-code-detection/model/fold0_best_model_epoch=8_val_loss=0.6583.ckpt' as top 1


For epoch 8
val_loss = 0.6583, val_acc = 0.6875
train_loss = 0.6168, train_acc = 0.6562


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.019 >= min_delta = 0.0. New best score: 0.640
Epoch 9, global step 320: 'val_loss' reached 0.63977 (best 0.63977), saving model to '/root/malicious-code-detection/model/fold0_best_model_epoch=9_val_loss=0.6398.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


For epoch 9
val_loss = 0.6398, val_acc = 0.625
train_loss = 0.6109, train_acc = 0.6406


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type          | Params
------------------------------------------
0 | network | DisasterModel | 755 K 
------------------------------------------
755 K     Trainable params
0         Non-trainable params
755 K     Total params
3.021     Total estimated model params size (MB)


Best train metrics values for fold0
{'train_loss': (0.6109, 9), 'train_acc': (0.6562, 5)}
Best val metrics values for fold0
{'val_loss': (0.6398, 10), 'val_acc': (0.6875, 9)}
Running training for fold1


Sanity Checking: 0it [00:00, ?it/s]

For epoch 0
val_loss = 0.6993, val_acc = 0.5


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.682
Epoch 0, global step 32: 'val_loss' reached 0.68227 (best 0.68227), saving model to '/root/malicious-code-detection/model/fold1_best_model_epoch=0_val_loss=0.6823.ckpt' as top 1


For epoch 0
val_loss = 0.6823, val_acc = 0.5
train_loss = 0.6918, train_acc = 0.5469


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.676
Epoch 1, global step 64: 'val_loss' reached 0.67551 (best 0.67551), saving model to '/root/malicious-code-detection/model/fold1_best_model_epoch=1_val_loss=0.6755.ckpt' as top 1


For epoch 1
val_loss = 0.6755, val_acc = 0.5
train_loss = 0.6885, train_acc = 0.5938


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.675
Epoch 2, global step 96: 'val_loss' reached 0.67467 (best 0.67467), saving model to '/root/malicious-code-detection/model/fold1_best_model_epoch=2_val_loss=0.6747.ckpt' as top 1


For epoch 2
val_loss = 0.6747, val_acc = 0.5625
train_loss = 0.6699, train_acc = 0.5781


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type          | Params
------------------------------------------
0 | network | DisasterModel | 755 K 
------------------------------------------
755 K     Trainable params
0         Non-trainable params
755 K     Total params
3.021     Total estimated model params size (MB)


Best train metrics values for fold1
{'train_loss': (0.6699, 2), 'train_acc': (0.5938, 1)}
Best val metrics values for fold1
{'val_loss': (0.6747, 3), 'val_acc': (0.5625, 3)}
Running training for fold2


Sanity Checking: 0it [00:00, ?it/s]

For epoch 0
val_loss = 0.675, val_acc = 1.0


Training: 0it [00:00, ?it/s]