# Sequence Learning of Smart Contract Security Exploits

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

# Input data files are available in the "./input_data/" directory.
import os
print(os.listdir("./input_data/final"))
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"  # specify which GPU(s) to be used
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.tools.plotting import table

### Define functions

In [None]:
def label(df):
    """Add a binary ``LABEL`` column to ``df`` in place.

    Rows whose ``CATEGORY`` equals ``'1 0 0 0'`` get label 0; every other
    row gets label 1. An existing ``LABEL`` column is overwritten.
    Nothing is returned.
    """
    # True wherever the category is anything other than the '1 0 0 0' code.
    is_flagged = df['CATEGORY'] != '1 0 0 0'
    df['LABEL'] = is_flagged.astype('int64')
    
def nlp_preprocess(df, tokenizer=None, n_most_common_opcodes=1000, max_len=130):
    """Convert the ``OPCODE`` column of ``df`` into a padded integer matrix.

    Args:
        df: DataFrame with an ``OPCODE`` column of opcode strings.
        tokenizer: Optional, already fitted Keras ``Tokenizer``. When given it
            is used as-is, so token ids stay consistent with the data it was
            fitted on (e.g. reuse the training tokenizer for evaluation sets).
            When ``None`` (the default, original behaviour) a new tokenizer is
            created and fitted on ``df`` itself.
        n_most_common_opcodes: ``num_words`` cap for a newly created tokenizer.
            Ignored when ``tokenizer`` is supplied.
        max_len: Length every sequence is padded/truncated to.

    Returns:
        The array produced by ``pad_sequences`` with ``maxlen=max_len``.
    """
    texts = df['OPCODE'].values

    if tokenizer is None:
        # NOTE: fitting here means each call builds its own vocabulary; ids
        # from two separate calls are not comparable. Pass `tokenizer` to
        # share one vocabulary across datasets.
        tokenizer = Tokenizer(num_words=n_most_common_opcodes, lower=False)
        tokenizer.fit_on_texts(texts)

    # Map each opcode string to a sequence of integer token ids.
    sequences = tokenizer.texts_to_sequences(texts)

    # Report the size of the tokenizer's vocabulary.
    print('Found %s unique tokens.' % len(tokenizer.word_index))

    # Bring all sequences to the same length (pad_sequences defaults apply).
    return pad_sequences(sequences, maxlen=max_len)

def dftoXY(df):
    """Build model inputs and one-hot targets from a contract DataFrame.

    Side effects: adds/overwrites the ``LABEL`` column of ``df`` (via
    ``label``) and prints the label distribution.

    Args:
        df: DataFrame with ``OPCODE`` and ``CATEGORY`` columns.

    Returns:
        Tuple ``(X, y)``: the padded sequences from ``nlp_preprocess(df)`` and
        the result of ``to_categorical`` on ``LABEL`` with two classes.
    """
    features = nlp_preprocess(df)

    # Derive the binary LABEL column in place, then show its distribution.
    label(df)
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    print(df['LABEL'].value_counts())

    targets = to_categorical(df['LABEL'], num_classes=2)
    return features, targets

### Loading the clean dataset

In [None]:
# File name of the training CSV, read from ./input_data/final/.
dataset = 'clean_train.csv'
# Load only the three columns used by the rest of the notebook.
data = pd.read_csv('./input_data/final/'+dataset, usecols=['ADDRESS', 'OPCODE', 'CATEGORY'])
# Preview the first rows.
data.head()

### Checking the category distribution

In [None]:
# Number of rows per CATEGORY code (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
data['CATEGORY'].value_counts()

### Loading saved model

In [None]:
# loading previously saved model
# NOTE(review): the training cell further down rebinds `model` to a newly built
# Sequential network, so this loaded model is only in effect if that cell is skipped.
from keras.models import load_model
model = load_model('./saved_model/'+'9x_450k.h5')

### Divide data by category (non-vulnerable vs. vulnerable)

In [None]:
# Partition the contracts by their CATEGORY code.
n = data.loc[data['CATEGORY'].eq('1 0 0 0')]   # no vulnerabilities
s = data.loc[data['CATEGORY'].eq('0 1 0 0')]   # suicidal
p = data.loc[data['CATEGORY'].eq('0 0 1 0')]   # prodigal
g = data.loc[data['CATEGORY'].eq('0 0 0 1')]   # greedy
sp = data.loc[data['CATEGORY'].eq('0 1 1 0')]  # suicidal and prodigal

# Shuffle the non-vulnerable contracts with a fixed seed.
n_shuf = n.sample(frac=1, replace=False, random_state=39)

# The first `num_train` shuffled rows go to training; the rest is held out.
num_train = 450000
norm_train = n_shuf.iloc[:num_train]
norm_test = n_shuf.iloc[num_train:]
num_test = n_shuf.shape[0] - num_train

n_shuf.head()

In [None]:
# Oversample the vulnerable contracts: repeat every vulnerable row `multiple` times.
multiple = 9
pos = pd.concat([s, p, g, sp] * multiple)

# Combine the oversampled vulnerable rows with the non-vulnerable training rows.
resampled_set = pd.concat([pos, norm_train], ignore_index=True)

# Shuffle the combined set with a fixed seed.
resampled_set = resampled_set.sample(frac=1, random_state=39, replace=False)

# Category distribution after resampling (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
resampled_set['CATEGORY'].value_counts()

In [None]:
# Add the binary LABEL column to the resampled set and count each label.
label(resampled_set)
# Series.value_counts replaces the deprecated top-level pd.value_counts.
resampled_set['LABEL'].value_counts()

### Preprocessing dataset

In [None]:
# Tokenise/pad the opcodes and one-hot encode the labels of the resampled set.
X, y = dftoXY(resampled_set)

# Hold out 20% of the processed rows as a test split (fixed seed).
# NOTE(review): the vulnerable rows were duplicated before this split, so
# copies of the same contract can end up in both train and test — confirm
# this is intended when interpreting the test accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

print()
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

### Training LSTM Model

In [None]:
# Init: hyperparameters consumed by the model-building/training cell.
epochs = 50  # number of epochs passed to model.fit
emb_dim = 128  # output dimension of the Embedding layer
batch_size = 256  # batch size passed to model.fit

In [None]:
print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))
# Size of the embedding vocabulary. Keep this equal to the `num_words` cap the
# Tokenizer is created with in nlp_preprocess, since token ids index this table.
n_most_common_opcodes = 1000

# NOTE: this rebinds `model`; any model loaded from disk earlier is replaced.
model = Sequential()
# n_most_common_opcodes=size of the vocabulary, emb_dim=dimension of the dense
# embedding, input_length=length of the (constant-length) input sequences
model.add(Embedding(n_most_common_opcodes, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.85))
model.add(LSTM(64, dropout=0.85, recurrent_dropout=0.85))
# NOTE(review): two sigmoid units with binary_crossentropy on one-hot targets
# treat the two columns independently; softmax + categorical_crossentropy is
# the usual choice for mutually exclusive classes — confirm this is intended.
model.add(Dense(2, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None").
model.summary()

import time
start_time = time.time()

# training the model; 20% of the training split is held back for validation
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

end_time = time.time()
print('Time taken: ', end_time-start_time)

### Test Set 

In [None]:
# Evaluate the model on the held-out test split; the first two returned values
# are reported below as loss and accuracy respectively.
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.4f}\n  Accuracy: {:0.4f}'.format(accr[0],accr[1]))

### Test FPs

In [None]:
# File names of the three FP test sets (leak, suicidal, greedy).
leakFP = 'clean_test_leakFP_noDups.csv'
suicidalFP = 'clean_test_suicidalFP_noDups.csv'
greedyFP = 'clean_test_greedyFP_noDups.csv'

# Read the same three columns from every file.
# NOTE(review): these files are read from ./input_data/ while the training CSV
# comes from ./input_data/final/ — confirm the directory.
leakFP_test = pd.read_csv(
    './input_data/'+leakFP, usecols=['ADDRESS', 'OPCODE', 'CATEGORY'])
suicidalFP_test = pd.read_csv(
    './input_data/'+suicidalFP, usecols=['ADDRESS', 'OPCODE', 'CATEGORY'])
greedyFP_test = pd.read_csv(
    './input_data/'+greedyFP, usecols=['ADDRESS', 'OPCODE', 'CATEGORY'])

# Evaluate each FP set in turn: preprocess, score, report loss and accuracy.
# NOTE(review): dftoXY builds a fresh tokenizer for every frame, so the token
# ids here need not match the ones the model was trained on — verify.
X_leakFP, y_leakFP = dftoXY(leakFP_test)
leakFP_accr = model.evaluate(X_leakFP, y_leakFP)
print('Leak test set\n  Loss: {:0.4f}\n  Accuracy: {:0.4f}'.format(*leakFP_accr[:2]))

X_suicidalFP, y_suicidalFP = dftoXY(suicidalFP_test)
suicidalFP_accr = model.evaluate(X_suicidalFP, y_suicidalFP)
print('Suicidal test set\n  Loss: {:0.4f}\n  Accuracy: {:0.4f}'.format(*suicidalFP_accr[:2]))

X_greedyFP, y_greedyFP = dftoXY(greedyFP_test)
greedyFP_accr = model.evaluate(X_greedyFP, y_greedyFP)
print('Greedy test set\n  Loss: {:0.4f}\n  Accuracy: {:0.4f}'.format(*greedyFP_accr[:2]))

### Sample 10,000 from non-vulnerable set not previously used for training or testing

In [None]:
# Sample 10,000 contracts from the held-out non-vulnerable rows.
# NOTE: this rebinds `norm_test` to the sample, so re-running the cell samples
# from the previous sample rather than from the full held-out set.
num_tr = 10000
norm_test = norm_test.sample(num_tr, random_state=9, replace=False)

# Reset the index, then shuffle these samples with a fixed seed. `trtestset`
# therefore has a different row order than `norm_test`.
trtestset = norm_test.reset_index(drop=True)
trtestset = trtestset.sample(frac=1, random_state=39, replace=False)

# preprocess data (also adds a LABEL column to trtestset)
X_trtest, y_trtest = dftoXY(trtestset)

# evaluate model on the sample; the first value is the loss, the second the
# accuracy, matching the labels used for the other evaluation printouts
trtest_accr = model.evaluate(X_trtest, y_trtest)
print('Unseen negatives test set\n  Loss: {:0.4f}\n  Accuracy: {:0.4f}'.format(trtest_accr[0], trtest_accr[1]))

### Collecting and saving flagged negatives for validation

In [None]:
# Model outputs for the sampled negatives; rows follow the order of X_trtest.
pred_neg = model.predict(X_trtest, verbose=1)

In [None]:
# Distribution of the second output column (the score for LABEL 1).
plt.hist(pred_neg[:,1])

# A sample counts as flagged when its LABEL-1 score exceeds 0.5.
flagged_mask = pred_neg[:, 1] > 0.5

predneg = np.count_nonzero(flagged_mask)
predneg_accr = predneg / len(pred_neg)
print('Percentage of Negatives Flagged: {:0.4f}'.format(predneg_accr))

# Row positions of the flagged samples (tuple holding one index array).
LSTM_pos = np.nonzero(flagged_mask)

# pred_neg follows the row order of `trtestset` (the frame that was passed to
# dftoXY), which was reshuffled after `norm_test` was sampled. The positions
# must therefore be applied to `trtestset`; applying them to `norm_test` would
# pick unrelated contracts. The LABEL column added by dftoXY is dropped so the
# result keeps the original input columns.
lstm_Negflagged = trtestset.iloc[LSTM_pos[0]].drop('LABEL', axis=1, errors='ignore')
# lstm_Negflagged.to_csv ('./output_data/lstm_flaggedNeg_final.csv', index = None, header=True)