# Import and connect to gg drive

In [1]:
# from google.colab import drive
# drive.mount("/content/drive")

In [2]:
# cd /content/drive/MyDrive/lab/

# Import package

In [3]:
import pandas as pd
import os
import collections
import numpy as np
import zipfile
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import *
from sklearn.model_selection import train_test_split

from torch.utils.data import TensorDataset, DataLoader
import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, FastText



In [4]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/y_test.csv
/kaggle/input/X_test.csv
/kaggle/input/y_val.csv
/kaggle/input/y_train.csv
/kaggle/input/X_train.csv
/kaggle/input/X_val.csv


In [5]:
if torch.cuda.is_available():
 dev = "cuda:0"
else:
 dev = "cpu"
device = torch.device(dev)
device

device(type='cuda', index=0)

# Save classification report

In [6]:
def save_classification(y_test, y_pred, out_dir, labels):
  if isinstance(y_pred, np.ndarray) == False:
    y_pred = y_pred.toarray()

  def accuracy(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        numerator = sum(np.logical_and(y_true[i], y_pred[i]))
        denominator = sum(np.logical_or(y_true[i], y_pred[i]))
        if denominator != 0:
          temp += numerator / denominator
    return temp / y_true.shape[0]

  out = classification_report(y_test,y_pred, output_dict=True, target_names=labels)
  total_support = out['samples avg']['support']

  mr = accuracy_score(y_test, y_pred)
  acc = accuracy(y_test,y_pred)
  hm = hamming_loss(y_test, y_pred)

  out['Exact Match Ratio'] = {'precision': mr, 'recall': mr, 'f1-score': mr, 'support': total_support}
  out['Hamming Loss'] = {'precision': hm, 'recall': hm, 'f1-score': hm, 'support': total_support}
  out['Accuracy'] = {'precision': acc, 'recall': acc, 'f1-score': acc, 'support': total_support}
  out_df = pd.DataFrame(out).transpose()
  print(out_df)

  out_df.to_csv(out_dir)

  return out_df

# Tokenizer

In [7]:
class Tokenizer(object):
    def __init__(self, num_words=None, lower=True) -> None:
        self.word_index = {}
        self.word_counts = {}
        self.num_words = num_words
        self.split = " "
        self.lower = lower

    def fit_on_texts(self, texts):
        """
        create vocabulary

        Args:
            text: list of strings or list of list of strings
        """
        for text in texts:
            seq = self.text_to_word_sequence(text)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1

                else:
                    self.word_counts[w] = 1
        vocab = self.word_counts.keys()
        self.word_index = dict(zip(vocab, list(range(1, len(vocab) + 1))))

    def text_to_word_sequence(self, input_text):
        if self.lower == True:
            input_text = input_text.lower()

        seq = input_text.split(self.split)
        return seq

    def texts_to_sequences(self, texts):
        return list(self.texts_to_sequences_generator(texts))

    def texts_to_sequences_generator(self, texts):
        for text in texts:
            seq = self.text_to_word_sequence(text)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                vect.append(i)
            yield vect

def pad_sequences(
    sequences,
    maxlen=None,
    dtype="int32",
    padding="post",
    truncating="post",
    value=0.0
):
    """
    Args:
        sequences: List of sequences (each sequence is a list of integers).
        maxlen: Optional Int, maximum length of all sequences. If not provided,
            sequences will be padded to the length of the longest individual
            sequence.
        dtype: (Optional, defaults to `"int32"`). Type of the output sequences.
            To pad sequences with variable length strings, you can use `object`.
        padding: String, "pre" or "post" (optional, defaults to `"pre"`):
            pad either before or after each sequence.
        truncating: String, "pre" or "post" (optional, defaults to `"pre"`):
            remove values from sequences larger than
            `maxlen`, either at the beginning or at the end of the sequences.
        value: Float or String, padding value. (Optional, defaults to 0.)

    Returns:
        Numpy array with shape `(len(sequences), maxlen)`

    Raises:
        ValueError: In case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """

    if not hasattr(sequences, "__len__"):
        raise ValueError("`sequences` must be iterable.")
    num_samples = len(sequences)

    lengths = []
    sample_shape = ()
    flag = True

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.

    for x in sequences:
        try:
            lengths.append(len(x))
            if flag and len(x):
                sample_shape = np.asarray(x).shape[1:]
                flag = False
        except TypeError as e:
            raise ValueError(
                "`sequences` must be a list of iterables. "
                f"Found non-iterable: {str(x)}"
            ) from e

    if maxlen is None:
        maxlen = np.max(lengths)

    is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(
        dtype, np.unicode_
    )
    if isinstance(value, str) and dtype != object and not is_dtype_str:
        raise ValueError(
            f"`dtype` {dtype} is not compatible with `value`'s type: "
            f"{type(value)}\nYou should set `dtype=object` for variable length "
            "strings."
        )

    x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == "pre":
            trunc = s[-maxlen:]
        elif truncating == "post":
            trunc = s[:maxlen]
        else:
            raise ValueError(f'Truncating type "{truncating}" not understood')

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError(
                f"Shape of sample {trunc.shape[1:]} of sequence at "
                f"position {idx} is different from expected shape "
                f"{sample_shape}"
            )

        if padding == "post":
            x[idx, : len(trunc)] = trunc
        elif padding == "pre":
            x[idx, -len(trunc) :] = trunc
        else:
            raise ValueError(f'Padding type "{padding}" not understood')
    return x

# Create DataLoader

# Define Model

In [8]:
class Branch(nn.Module):
  def __init__(self, input_size, hidden_size, dropout, num_outputs):
    super(Branch, self).__init__()

    self.dense1 = nn.Linear(input_size, hidden_size)
    self.batchnorm1 = nn.BatchNorm1d(hidden_size)
    self.dropout = nn.Dropout(p=dropout)
    self.dense2 = nn.Linear(hidden_size, num_outputs)

  def forward(self, x):
    # print("Branch Input Shape:", x.shape)
    out_dense1 = self.dense1(x)
    # print("After Dense1 Shape:", out_dense1.shape)
    out_batchnorm1 = self.batchnorm1(out_dense1)
    out_dropout = self.dropout(out_batchnorm1)
    out_dense2 = self.dense2(out_dropout)

    return out_dense2

In [9]:
class Escort(nn.Module):
  def __init__(self, vocab_size, embedd_size, rnn_hidden_size, n_layers, num_classes, method_type, bidirectional, is_multibranches, num_auxiliary=None, auxiliary_feature_length=None):
    super(Escort, self).__init__()
    
    self.word_embeddings = nn.Embedding(vocab_size, embedd_size, padding_idx=0)
    self.bidirectional = bidirectional
    self.is_multibranches = is_multibranches
    if method_type=='GRU':
        self.rnn = nn.GRU(embedd_size, rnn_hidden_size, num_layers=n_layers, batch_first=True, bidirectional=self.bidirectional)
    else:
        self.rnn = nn.LSTM(embedd_size, rnn_hidden_size, num_layers=n_layers, batch_first=True, bidirectional=self.bidirectional)
    if num_auxiliary is not None and auxiliary_feature_length is not None:
        if num_auxiliary > 2 or num_auxiliary < 0:
            raise ValueError('num_auxiliary must be in (0,1,2)')
        rnn_hidden_size = rnn_hidden_size + num_auxiliary*auxiliary_feature_length
        
    if self.is_multibranches:
        self.branches = nn.ModuleList([Branch(rnn_hidden_size, 128, 0.2, 1) for _ in range(num_classes)])
    else:
        self.branch = Branch(rnn_hidden_size, 128, 0.2, num_classes)
        
    self.sigmoid = nn.Sigmoid()

  def forward(self, sequence, tfidf_features=None,word2vec=None):
    # print("Input to Escort:", sequence.shape)
    embeds = self.word_embeddings(sequence)
    rnn_out, hidden = self.rnn(embeds)
    
    if self.bidirectional:
        rnn_out = (rnn_out[:, :, :self.rnn_hidden_size] + rnn_out[:, :, self.rnn_hidden_size:])
        
    if tfidf_features is not None:
        rnn_out = torch.cat([rnn_out, tfidf_features])
    if word2vec is not None:
        rnn_out = torch.cat([rnn_out, word2vec])
    
    if self.is_multibranches:
        out_branch = [branch(rnn_out[:, -1, :]) for branch in self.branches]
        out_branch = torch.cat(output_branches, dim=1)
    else:
        out_branch = self.branch(rnn_out[:, -1, :])
    outputs = self.sigmoid(out_branch)
    return outputs

In [10]:
class OpcodeData(Dataset):
    def __init__(self, X, y, tokenizer, tfidf, w2v_model, max_len):
        self.tokenizer = tokenizer
        self.X = X.to_numpy()
        self.targets = y
        self.max_len = max_len
        self.tfidf = tfidf
        self.model = w2v_model
        
    def avg(self,text):
        for x in text:
            k = x.split()
        word_vectors = [self.model.wv[word] for word in k]
        return np.mean(word_vectors, axis=0)
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        text = self.X[index]
        word2vec = self.avg(text)
        ids = self.tokenizer.texts_to_sequences([text])[0]
        ids = pad_sequences([ids], maxlen=self.max_len)[0]
        tfidf_features = tfidf.transform([text]).toarray()[0]

        return {
            'index': index,
            'ids': torch.tensor(ids, dtype=torch.long),
            'tfidf_features': torch.tensor(tfidf_features, dtype=torch.float),
            'word2vec': torch.tensor(word2vec, dtype=torch.float),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

In [11]:
def get_misclassified_data(labels, preds, indices):
  misclassify_data = {}
  for i in range(len(labels)):
    is_append = False
    reject_label = np.array(labels[i])
    for j in range(len(labels[i])):
      if labels[i, j] != preds[i, j]:
        reject_label[j] = 2 # reject label
        is_append = True

    if is_append:
      x_train_index = indices[i]
      misclassify_data[x_train_index] = np.array(reject_label)
  return misclassify_data

# Trainer

## Train and Validation Steps

In [12]:
def train_steps(training_loader, model, loss_f, optimizer):
    print('Training...')
    training_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    train_acc = 0.
    train_f1 = 0.
    misclassify_train_data = {}

    model.train()

    for step, batch in enumerate(training_loader):
        # push the batch to gpu
        indices = batch['index'].numpy()
        tfidf_features = batch['tfidf_features'].to(device)
        ids = batch['ids'].to(device)
        word2vec = batch['word2vec'].to(device)
        targets = batch['targets'].to(device)

        preds, max_indices = model(ids, tfidf_features=tfidf_features,word2vec=word2vec)

        # calculate the loss for each branch
        losses = [loss_f(preds[i], targets[:, i]) for i in range(targets.shape[1])]
        average_loss = sum(losses) / targets.shape[1]
        training_loss += average_loss.item()

        label_ids = targets.to('cpu').numpy()
        max_indices = max_indices.detach().cpu().numpy()
        acc_score = accuracy_score(label_ids, max_indices)
        train_acc += acc_score

        misclassify_data = get_misclassified_data(label_ids, max_indices, indices)
        misclassify_train_data.update(misclassify_data)

        nb_tr_steps += 1

        optimizer.zero_grad()
        average_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # When using GPU
        optimizer.step()

    epoch_loss = training_loss / nb_tr_steps
    epoch_acc = train_acc / nb_tr_steps

    return epoch_loss, epoch_acc, misclassify_train_data


def evaluate_steps(validating_loader, model, loss_f):
    print("Evaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []
    total_labels = []
    # iterate over batches
    for step, batch in enumerate(validating_loader):
        # push the batch to gpu
        indices = batch['index'].numpy()
        tfidf_features = batch['tfidf_features'].to(device)
        ids = batch['ids'].to(device)
        word2vec = batch['word2vec'].to(device)
        targets = batch['targets'].to(device)
        
        # deactivate autograd
        with torch.no_grad():
            # model predictions
            preds, max_indices = model(ids, attention_mask=mask, token_type_ids=token_type_ids, tfidf_features=tfidf_features,word2vec=word2vec)

            # compute the validation loss between actual and predicted values
            losses = [loss_f(preds[i], targets[:, i]) for i in range(targets.shape[1])]
            average_loss = sum(losses) / targets.shape[1]
            total_loss += average_loss.item()

            max_indices = max_indices.detach().cpu().numpy()
            total_preds += list(max_indices)
            total_labels += targets.tolist()
    # compute the validation loss of the epoch
    avg_loss = total_loss / len(validating_loader)
    acc_score = accuracy_score(total_labels, total_preds)

    return avg_loss, acc_score

## Training loop

In [13]:
def train(epochs, model, optimizer, criterion, dataloader, save_model_dir):
  data_train_loader, data_val_loader = dataloader
  # set initial loss to infinite
  best_valid_loss = float('inf')
  train_losses = []
  valid_losses = []
  train_accuracies = []
  valid_accuracies = []
  misclassify_train_data = {}
  total_time = 0.0

  for epoch in range(epochs):
    print('Epoch {}/{} '.format(epoch + 1, epochs))
    start_time = time.time()
    train_loss, train_acc, misclassify_train_steps_data = train_steps(data_train_loader, model, criterion, optimizer)
    valid_loss, valid_acc = evaluate_steps(data_val_loader, model, criterion)

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), save_model_dir)
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accuracies.append(train_acc)
    valid_accuracies.append(valid_acc)
    misclassify_train_data.update(misclassify_train_steps_data)

    elapsed_time = time.time() - start_time
    total_time += elapsed_time

    print('\t loss={:.4f} \t accuracy={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(train_loss, train_acc, valid_loss, valid_acc, elapsed_time))
  print(f'Total time: {total_time}')
  return train_accuracies, valid_accuracies, train_losses, valid_losses, misclassify_train_data

In [14]:
def plot_graph(epochs, train, valid, tittle):
    fig = plt.figure(figsize=(12,12))
    plt.title(tittle)
    plt.plot(list(np.arange(epochs) + 1) , train, label='train')
    plt.plot(list(np.arange(epochs) + 1), valid, label='validation')
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(loc='best')


## Test Model

In [15]:
def predict(testing_loader, model):
    print("\nPredicting...")
    # deactivate dropout layers
    model.eval()

    # empty list to save the model predictions
    total_preds = []
    total_labels = []
    # iterate over batches
    for step, batch in enumerate(testing_loader):
        # push the batch to gpu
        indices = batch['index'].numpy()
        tfidf_features = batch['tfidf_features'].to(device)
        ids = batch['ids'].to(device)
        word2vec = batch['word2vec'].to(device)
        targets = batch['targets'].to(device)

        # deactivate autograd
        with torch.no_grad():
            # model predictions
            preds, max_indices = model(ids, attention_mask=mask, token_type_ids=token_type_ids, tfidf_features=tfidf_features,word2vec=word2vec)

            max_indices = max_indices.detach().cpu().numpy()
            total_preds += list(max_indices)
            total_labels += targets.tolist()

    return total_labels, total_preds

## Run

In [16]:
data_folder='/kaggle/input/'
out_folder ='/kaggle/working/'
# Define constant
input_size = 4100
epochs = 1
SIZE_OF_VOCAB = 512
EMBEDDED_SIZE = 5
GRU_HIDDEN_SIZE = 256
NUM_OUTPUT_NODES = 4
NUM_LAYERS = 1
DROPOUT = 0.2
RNN_TYPE = 'GRU'
BIDIRECTIONAL = True
USE_MULTIBRANCHES = True
LEARNING_RATE=1e-3
labels = ['Timestamp dependence', 'Outdated Solidity version', 'Frozen Ether', 'Delegatecall Injection']
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
save_model_dir = out_folder + 'escort-tfidf-w2v.pt'
report_dir = out_folder + 'escort-tfidf-w2v.csv'
tokenizer_dir = out_folder + 'tokenizer.pickle'
w2v_dir = out_folder + 'fasttext_w2v.pickle'
tfidf_dir = out_folder + 'tfidf.pickle'

In [17]:

# data_folder = os.getcwd()+'/Wisdomnet/Untitled Folder/'
# out_folder = os.getcwd()+'/trained/'
# Read data
X_train = pd.read_csv(data_folder+'X_train.csv')
X_test = pd.read_csv(data_folder+'X_test.csv')
X_val = pd.read_csv(data_folder+'X_val.csv')

y_train = pd.read_csv(data_folder+'y_train.csv').to_numpy()
y_test = pd.read_csv(data_folder+'y_test.csv').to_numpy()
y_val = pd.read_csv(data_folder+'y_val.csv').to_numpy()

In [18]:
# Create tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=np.copy(X_train['BYTECODE'].to_numpy()))

In [19]:
tfidf = TfidfVectorizer(max_features=256)
tfidf_features = tfidf.fit_transform(np.copy(X_train['BYTECODE'].to_numpy()))

In [20]:
splits = []
for sentence in X_train['BYTECODE'].tolist():
    for x in sentence:
        l = x.split(' ')
    splits.append(l)

In [21]:
word2vec = FastText(splits, min_count=1, window=7,vector_size=256, epochs=10)

In [22]:
def save_model(model, save_model_dir):
    try:
        with open(save_model_dir, 'wb') as file:
            pickle.dump(model, file)
        print(f'save successfully model with {save_model_dir}')

    except:
        print(f'can\'t save model with {save_model_dir}')

def load_model(save_model_dir):
    with open(save_model_dir, 'rb') as file:
        model = pickle.load(file)

    return model

In [23]:
# save_model(tokenizer, tokenizer_dir)
# save_model(tfidf, tfidf_dir)
# save_model(word2vec, w2v_dir)

In [24]:
# word2vec = FastText(vector_size=256, window=7, min_count=1, sentences=splits, epochs=10, min_count=1)

In [25]:
train_dataset = OpcodeData(X=X_train, y=y_train, tokenizer=tokenizer, tfidf=tfidf, w2v_model=word2vec, max_len=input_size)
test_dataset = OpcodeData(X=X_test, y=y_test, tokenizer=tokenizer, tfidf=tfidf, w2v_model=word2vec, max_len=input_size)
val_dataset = OpcodeData(X=X_val, y=y_val, tokenizer=tokenizer, tfidf=tfidf, w2v_model=word2vec, max_len=input_size)

In [26]:
# Create generator for Dataset with BATCH_SIZE
training_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE)
validating_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE)
testing_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE)

In [27]:
# a = next(iter(validating_loader))

In [28]:
# Define model
model = Escort(SIZE_OF_VOCAB, EMBEDDED_SIZE, GRU_HIDDEN_SIZE, NUM_LAYERS, NUM_OUTPUT_NODES, RNN_TYPE, BIDIRECTIONAL, USE_MULTIBRANCHES)
model.to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()

Escort(
  (word_embeddings): Embedding(512, 5, padding_idx=0)
  (rnn): GRU(5, 256, batch_first=True, bidirectional=True)
  (branches): ModuleList(
    (0-3): 4 x Branch(
      (dense1): Linear(in_features=256, out_features=128, bias=True)
      (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (dense2): Linear(in_features=128, out_features=1, bias=True)
    )
  )
  (sigmoid): Sigmoid()
)


In [29]:
# Train model
train_accuracies, valid_accuracies, train_losses, valid_losses = train(epochs, model, optimizer, criterion, save_model_dir)

TypeError: train() missing 1 required positional argument: 'save_model_dir'

In [None]:
# Plot graph
plot_graph(epochs, train_losses, valid_losses, "Train/Validation Loss")
plot_graph(epochs, train_accuracies, valid_accuracies, "Train/Validation Accuracy")

In [None]:
# Test model
total_preds, total_labels, execution_time = predict(data_test_loader, model, criterion)

In [None]:
print(execution_time)

In [None]:
save_classification(y_pred=np.array(total_preds), y_test=np.array(total_labels), labels=labels, out_dir=report_dir)