In [3]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# My folder structure

<center>
  <img src="https://i.imgur.com/e6PWBAL.png" alt="folder-structure">
</center>

> Note: `Testing Samples` is a copy folder from 'Benign PE Samples' which is put in "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/". It has been used below. This picture did not specify that.

# Ex1: Obfuscated Javascript Files Detection

Importing the libraries

In [None]:
import os
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

Specify the paths of **obfuscated** and **normal (non-obfuscated)** JavaScript files. Then labeling them.

In [None]:
js_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/JavascriptSamples"
obfuscated_js_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/JavascriptSamplesObfuscated"
corpus = []
labels = []
file_types_and_labels = [(js_path, 0), (obfuscated_js_path, 1)]

- Corpus: Containing the content of each file 
- Label: label of each file

In [None]:
for files_path, label in file_types_and_labels:
  files = os.listdir(files_path)
  print(files)
  for file in files:
    file_path = files_path + "/" + file
    try:
      with open(file_path, "r") as myfile:
        data = myfile.read().replace("\n", "")
        data = str(data)
        corpus.append(data)
        labels.append(label)
    except:
      pass

['angular-locale_lrc-obfuscated.js', 'angular-locale_lt-lt-obfuscated.js', 'angular-locale_lt-obfuscated.js', 'angular-locale_lu-cd-obfuscated.js', 'angular-locale_luo-ke-obfuscated.js', 'angular-locale_lu-obfuscated.js', 'angular-locale_luo-obfuscated.js', 'angular-locale_luy-ke-obfuscated.js', 'angular-locale_luy-obfuscated.js', 'angular-locale_lv-lv-obfuscated.js', 'angular-locale_mas-ke-obfuscated.js', 'angular-locale_lv-obfuscated.js', 'angular-locale_mas-tz-obfuscated.js', 'angular-locale_mas-obfuscated.js', 'angular-locale_mer-ke-obfuscated.js', 'angular-locale_mer-obfuscated.js', 'angular-locale_mfe-mu-obfuscated.js', 'angular-locale_mfe-obfuscated.js', 'angular-locale_mg-mg-obfuscated.js', 'angular-locale_mg-obfuscated.js', 'angular-locale_mgh-mz-obfuscated.js', 'angular-locale_mgh-obfuscated.js', 'angular-locale_mgo-cm-obfuscated.js', 'angular-locale_mk-mk-obfuscated.js', 'angular-locale_mgo-obfuscated.js', 'angular-locale_mk-obfuscated.js', 'angular-locale_ml-in-obfuscated.j

In [None]:
# Split dataset 
X_train, X_test, y_train, y_test = train_test_split(
  corpus, 
  labels, 
  test_size=0.33, 
  random_state=42
)

Use Pipeline to perform **NLP (Natural Language Processing)** and **Random Forest** classifier

In [None]:
text_clf = Pipeline(
  [
    ("vect", HashingVectorizer(
        input="content", 
        ngram_range=(1, 3)
        )
    ), # Return a sparse matrix. Use 'Hashing Vectorizer'. This process is done by: 
    # Evaluate the occurence of each token -> Use a hash function to hash the value. The purpose is to find the index for that one
    # -> Calculate the probability of each occurence. This would output a matrix.
    # The main advantage is the fixed output array, as it does not require to store the hashed value of each token.
    # Reference: https://www.dictionary4it.com/term/feature-hashing-6717/ (Note: This is just a simulation for the above process.)

    ("tfidf", TfidfTransformer(use_idf=True,)), # Calcute the importance of a token in a document. For more details: https://viblo.asia/p/tf-idf-term-frequency-inverse-document-frequency-JQVkVZgKkyd
    ("rf", RandomForestClassifier(class_weight="balanced")), # Use 'Random Forest' algorithm for training
  ]
)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = HashingVectorizer(n_features=2**4)
X = vectorizer.fit_transform(corpus)
X.data

array([-0.57735027, -0.57735027,  0.57735027,  0.        , -0.81649658,
        0.40824829,  0.40824829,  0.        , -0.70710678,  0.70710678,
        0.        ,  0.        , -0.57735027, -0.57735027,  0.57735027,
        0.        ])

Fit dataset to the *Pipeline*

In [None]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', HashingVectorizer(ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()),
                ('rf', RandomForestClassifier(class_weight='balanced'))])

Evaluate the training result

In [None]:
y_test_pred = text_clf.predict(X_test)
print("Accuracy Score: %s" % accuracy_score(y_test, y_test_pred))
print("Confusion matrix: \n %s" % confusion_matrix(y_test, y_test_pred))

Accuracy Score: 0.9649910233393177
Confusion matrix: 
 [[609  26]
 [ 13 466]]


# Ex2: Extracting features from PDF files

> *PDFID*: Essentially, the tool *scans through a PDF file*, and counts **the number of occurrences** of each of the **~20 features**

In [59]:
!pip uninstall pdfid # This line can be omitted. As we install 'pdfid' manually



IPython's io module so as to capture the output of an external script

In [80]:
from IPython.utils import io # import lib

In [121]:
def PDF_to_FV(file_path):
  """Featurize a PDF file using pdfid."""
  with io.capture_output() as captured:
    %run -i pdfid $file_path 
    # '%run' is a magic function of the loaded module. It execute `pdfid` command syntax with '-i' option to ignore 'sys.exit()'
    # More reference at: https://ipython.org/ipython-doc/dev/interactive/magics.html#magic-run
  print(captured) 
  out = captured.stdout
  out1 = out.split("\n")[2:-2]
  return [int(x.split()[-1]) for x in out1]

In [112]:
# định nghĩa hàm để featurize PDF 
def PDF_to_FV(file_path):
  """Featurize a PDF file using pdfid.""" 
  # Run 'pdfid' trên một file và capture đầu ra của nó 
  with io.capture_output() as captured:
    %run -i pdfid $file_path
  print(captured)
  out = captured.stdout
  out1 = out.split("\n")[2:-2]
  return [int(x.split()[-1]) for x in out1]

In [103]:
from os import listdir 
PDFs_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/PDFSamples/"

In [104]:
%cd "/content/drive/MyDrive/Shared Drive/Lab3/Tool/"
!ls

/content/drive/MyDrive/Shared Drive/Lab3/Tool
pdfid.py      upx-4.0.0-amd64_linux.tar.xz  upx-4.0.0-win64
pdfid_v0_2_8  upx-4.0.0-linux


In [105]:
!pwd

/content/drive/MyDrive/Shared Drive/Lab3/Tool


Loop through each file in the folder 'PDFSamples', featurize it, and then collect all the feature vectors into X

In [123]:
X = []
files = listdir(PDFs_path)
print(files)
for file in files:
  try:
    print(PDFs_path)
    file_path = "\"" + PDFs_path + file + "\""
    print(file_path)
    X.append(PDF_to_FV(file_path))
  except(e):
    print(e)
print(X)

['pdfDOCS_User_Reference_Guide-1.pdf', 'PythonBrochure.pdf']
/content/drive/MyDrive/Shared Drive/Lab3/Dataset/PDFSamples/
"/content/drive/MyDrive/Shared Drive/Lab3/Dataset/PDFSamples/pdfDOCS_User_Reference_Guide-1.pdf"
PDFiD 0.2.8 /content/drive/MyDrive/Shared Drive/Lab3/Dataset/PDFSamples/pdfDOCS_User_Reference_Guide-1.pdf
 PDF Header: %PDF-1.6
 obj                  153
 endobj               153
 stream                82
 endstream             82
 xref                   2
 trailer                2
 startxref              2
 /Page                  7
 /Encrypt               0
 /ObjStm                0
 /JS                    0
 /JavaScript            0
 /AA                    0
 /OpenAction            0
 /AcroForm              2
 /JBIG2Decode           0
 /RichMedia             0
 /Launch                0
 /EmbeddedFile          0
 /XFA                   0
 /Colors > 2^24         0


/content/drive/MyDrive/Shared Drive/Lab3/Dataset/PDFSamples/
"/content/drive/MyDrive/Shared Drive/Lab3/D

# Ex3: Extracting N-grams by using hash-gram

Import modules

In [None]:
from os import listdir
from nltk import ngrams
import hashlib

Specify **Benign** and **Malicious** directories. Set 'N_gram'

In [None]:
directories = ["/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Benign PE Samples", "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Malicious PE Samples"]
N_gram = 2

Define function as the same as **LAB2**. The comments are also clear.

In [None]:
def read_file(file_path):
  """Reads in the binary sequence of a binary file."""
  with open(file_path, "rb") as binary_file:
    data = binary_file.read()
    return data
def byte_sequence_to_Ngrams(byte_sequence, N_gram):
  """Creates a list of N-grams from a byte sequence."""
  return ngrams(byte_sequence, N_gram)

In [None]:
def hash_input(inp):
  """Compute the MD5 hash of an input."""
  return int(hashlib.md5(inp).hexdigest(), 16)
  
def make_ngram_hashable(Ngram):
  """Convert N-gram into bytes to be hashable."""
  return bytes(Ngram)

In [None]:
def hash_file_Ngrams_into_dictionary(file_Ngrams, T):
  """Hashes N-grams in a list and then keeps track of the counts in a
  dictionary."""
  for Ngram in file_Ngrams: # loop through N grams in that file. Then hash each grams
    hashable_Ngram = make_ngram_hashable(Ngram)
    hashed_and_reduced = hash_input(hashable_Ngram) % B_prime_num # Create index for each grams by '%" operator. B_prime_num: Limit the number of keys in the dictionary
    T[hashed_and_reduced] = T.get(hashed_and_reduced, 0) + 1 # Save "index": "value" key pairs.

In [None]:
B_prime_num = 65521 # B_prime_num < 2^16, a.k.a modulo B (%B). This parameter ensures that there can be no more than B keys in the dictionary:
T_dict = {} 

# Iterate over the directories, over each files and save to a dictionary `T_dict`
for dataset_path in directories:
  samples = [f for f in listdir(dataset_path)]
  for file in samples:
    file_path = dataset_path + "/" + file
    file_byte_sequence = read_file(file_path)
    file_Ngrams = byte_sequence_to_Ngrams(file_byte_sequence, N_gram)
    hash_file_Ngrams_into_dictionary(file_Ngrams, T_dict)

In [None]:
import heapq

In [None]:
K1 = 1000
K1_most_common_Ngrams_Using_Hash_Grams = heapq.nlargest(K1, T_dict)

In [None]:
def featurize_sample(file, K1_most_common_Ngrams_Using_Hash_Grams):
  """Takes a sample and produces a feature vector.
  The features are the counts of the K1 N-grams we've selected.
  """
  K1 = len(K1_most_common_Ngrams_Using_Hash_Grams)
  fv = K1 * [0]
  file_byte_sequence = read_file(file_path)
  file_Ngrams = byte_sequence_to_Ngrams(file_byte_sequence, N_gram)
  for Ngram in file_Ngrams:
    hashable_Ngram = make_ngram_hashable(Ngram)
    hashed_and_reduced = hash_input(hashable_Ngram) % B_prime_num
    if hashed_and_reduced in K1_most_common_Ngrams_Using_Hash_Grams:
      index = K1_most_common_Ngrams_Using_Hash_Grams.index(hashed_and_reduced)
      fv[index] += 1
  return fv

Extracting *N-gram* over each file

In [None]:
X = [] 
for dataset_path in directories: 
  samples = [f for f in listdir(dataset_path)] 
for file in samples: 
  file_path = dataset_path + "/" + file 
  X.append(featurize_sample(file_path, K1_most_common_Ngrams_Using_Hash_Grams))

In [None]:
X # Result should be 1000

768

# Ex4: Building Dynamic Malicious-File Classifier

> Parsing log data files which are extracted by **dynamic analysis**

>  The dynamic analysis was performed on several **LG Nexus 5 devices** with Android **API 23**, (over **4,000 malicious apps**)

In [None]:
import numpy as np
import os
import json
directories_with_labels = [("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/DA Logs Benign", 0),
                           ("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/DA Logs Malware", 1)]

In [None]:
def get_API_class_method_type_from_log(log):
  """Parses out API calls from behavioral logs."""
  API_data_sequence = []
  with open(log) as log_file:
    json_log = json.load(log_file) # Parse log file's content and handle it as JSON format. <`load` is applied for *text files* or binary files>
    api_calls_array = "[" + json_log["api_calls"] + "]" # Get value from 'api_calls' property from JSON Object. It is also a JSON Object.
    api_calls = json.loads(api_calls_array) # Parse object above and handle it as JSON format. <`loads` supports a str, bytes or bytearray instance>
    for api_call in api_calls:
      data = api_call["class"] + ":" + api_call["method"] + ":" + api_call["type"] # Only do care of values of `class`, `method` and `type` properties extracted dynamically
      API_data_sequence.append(data)
    return API_data_sequence

Specify `corpus` and `label`

In [None]:
data_corpus = []
labels = []

Loop over each file to get their API information

In [None]:
for directory, label in directories_with_labels:
  logs = os.listdir(directory)
  for log_path in logs:
    file_path = directory + "/" + log_path
    try:
      data_corpus.append(get_API_class_method_type_from_log(file_path))
      labels.append(label)
    except:
      pass
# print(data_corpus[0])

Split our dataset

In [None]:
from sklearn.model_selection import train_test_split

corpus_train, corpus_test, y_train, y_test = train_test_split(
  data_corpus, 
  labels, 
  test_size=0.2, 
  random_state=11
)

In [None]:
import collections
from nltk import ngrams
import numpy as np

Defining function to get *N-grams* for a specific file 

In [None]:
def read_file(file_path):
  """Reads in the binary sequence of a binary file."""
  with open(file_path, "rb") as binary_file:
    data = binary_file.read()
  return data
  
def text_to_Ngrams(text, n):
  """Produces a list of N-grams from a text."""
  Ngrams = ngrams(text, n)
  return list(Ngrams)

def get_Ngram_counts(text, N):
  """Get a frequency count of N-grams in a text."""
  Ngrams = text_to_Ngrams(text, N)
  return collections.Counter(Ngrams)

Collect all *N-grams*

In [None]:
N_gram = 4
total_Ngram_count = collections.Counter([]) 
for file in corpus_train:
  total_Ngram_count += get_Ngram_counts(file, N_gram)

Cut down to the `K1 = 3000` most frequent N-grams and save them into an array

In [None]:
K1 = 3000
K1_most_frequent_Ngrams = total_Ngram_count.most_common(K1)
K1_most_frequent_Ngrams_list = [x[0] for x in K1_most_frequent_Ngrams]

In [None]:
def featurize_sample(file, Ngrams_list):
  """Takes a sample and produces a feature vector.
  The features are the counts of the K1 N-grams we've selected.
  """
  K1 = len(Ngrams_list)
  feature_vector = K1 * [0]
  fileNgrams = get_Ngram_counts(file, N_gram)
  for i in range(K1):
    feature_vector[i] = fileNgrams[Ngrams_list[i]]
  return feature_vector

# The output vector is an array whose each value is the most common 'N-grams' of that file, 
# chosen from `K1_most_frequent_Ngrams_list`  as `Ngrams_list` parameters that is extracted by whole dataset before


Construct the **(train, test)** data with the idea above

In [None]:
X_train = []
for sample in corpus_train:
  X_train.append(featurize_sample(sample, K1_most_frequent_Ngrams_list))
X_train = np.asarray(X_train)

X_test = []
for sample in corpus_test:
  X_test.append(featurize_sample(sample, K1_most_frequent_Ngrams_list))
X_test = np.asarray(X_test)

Prepare library for training 

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

Construct a *Pipeline*. Related reference:
- SelectKBest: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
- mutual_info_classif: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html
- XGBClassifier: https://ongxuanhong.wordpress.com/2017/12/21/xgboost-thuat-toan-gianh-chien-thang-tai-nhieu-cuoc-thi-kaggle/

In [None]:
K2 = 500
mi_pipeline = Pipeline(
  [
    ("mutual_information", SelectKBest(mutual_info_classif, k=K2)),
   # mutual_info_classif: Estimate mutual information for a discrete target variable 
   # SelectKBest: Select features according to the k highest scores. k: Number of top features to select, in this case is `K2`
    ("xgb", XGBClassifier()), # Use 'XGBoost' algorithm
  ]
)

In [None]:
mi_pipeline.fit(X_train, y_train)

Pipeline(steps=[('mutual_information',
                 SelectKBest(k=500,
                             score_func=<function mutual_info_classif at 0x7f7c10db6ef0>)),
                ('xgb', XGBClassifier())])

In [None]:
print("Training accuracy:")
print(mi_pipeline.score(X_train, y_train))
print("Testing accuracy:")
print(mi_pipeline.score(X_test, y_test))

Training accuracy:
0.8156945279615153
Testing accuracy:
0.7919422730006013


# Ex5: MalConv – Process in applying Deep Learning for Malicious PE file Detection

<center>
  <img src="https://i.imgur.com/svjz9wF.png" alt="model">
</center>

In [None]:
import numpy as np
from tqdm import tqdm # Keep track of progress in our loops

Defining **embedded function** with the formula as follows:

Use a simple embedding of bytes in an 8-
dimensional space, in which each bit of the byte corresponds to a coordinate of the vector. A bit equal to `1` means that the corresponding coordinate is set to `1/16`, whereas a
bit value of `0` corresponds to a coordinate equal to `-1/16`

In [None]:
def embed_bytes(byte):
  binary_string = "{0:08b}".format(byte) # Convert bytes to binary format. Example with 1 byte: 00001110(14)
  vec = np.zeros(8) # Create a 1-D array comprising of '0's 
  for i in range(8): 
    if binary_string[i] == "1":
      vec[i] = float(1) / 16
    else:
      vec[i] = -float(1) / 16
  return vec

In [None]:
import os
from os import listdir

In [None]:
directories_with_labels = [("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Benign PE Samples", 0), 
                           ("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Malicious PE Samples", 1)]
list_of_samples = []
labels = []

*Assign label* for each specific file

In [None]:
for dataset_path, label in directories_with_labels:
  samples = [f for f in listdir(dataset_path)]
  for file in samples:
    file_path = os.path.join(dataset_path, file)
    list_of_samples.append(file_path)
    labels.append(label)

In [None]:
def read_file(file_path):
  """Read the binary sequence of a file."""
  with open(file_path, "rb") as binary_file:
    return binary_file.read()

Set a maximum length `maxSize` of bytes to read in per sample, then embedding all the
bytes of the samples, and gather the result in `X`:

In [None]:
max_size = 15000
num_samples = len(list_of_samples)
X = np.zeros((num_samples, 8, max_size))
Y = np.asarray(labels)
file_num = 0

In [None]:
for file in tqdm(list_of_samples):
  sample_byte_sequence = read_file(file)
  for i in range(min(max_size, len(sample_byte_sequence))):
    X[file_num, :, i] = embed_bytes(sample_byte_sequence[i])
  file_num += 1

100%|██████████| 271/271 [00:27<00:00,  9.68it/s]


Add `optimizer` function, use **SGD algorithm**

> Note: Those hyperparameters are often determined by their popularity or using tools

In [None]:
from keras import optimizers

my_opt = optimizers.SGD(lr=0.01, decay=1e-5, nesterov=True)

# ========== EXPLAIN SOME HYPER-PARAMETER ===========

# lr (Learning rate): Learning rate is understood as a proportional part of a model weight shift that is updated according to the passed mini-batches. 
# The magnitude of the learning rate will directly affect the convergence rate (our resuilt) of the loss function to the global 'extreme point'.

# decay: is a part of `Learning rate`. It is a factor that helps to suppress the magnitude of the learning rate over training time in order to avoid 
# their values ​​being too high at the stage when the model weights have come to convergence.

# nesterov: is another factor which speeds up convergence rate and our result.

  super(SGD, self).__init__(name, **kwargs)


Use **Keras functional API** to set up the deep neural network architecture

Some referenced links about parameters:
- http://itechseeker.com/tutorials/nlp-with-deep-learning/thuc-hanh-viet-ung-dung/phan-loai-bai-viet-voi-cnn/
- https://viblo.asia/p/ung-dung-convolutional-neural-network-trong-bai-toan-phan-loai-anh-4dbZNg8ylYM
- Conv1D layer: https://keras.io/api/layers/convolution_layers/convolution1d/

In [None]:
from keras import Input
inputs = Input(shape=(8, max_size)) # shape: specify the number `n` that the expected input will be batches of n-dimensional vectors.

from keras.layers import Conv1D
conv1 = Conv1D(kernel_size=(128), # Size of kernel, this will be the height for Filter
                filters=32, # number of filter
                strides=(128), # distance between 2 kernels when scanning. In other words, it modifies the amount of movement over the image or video
                padding="same" #  the amount of pixels added to an matrix to be suitable when this matrix being processed by the kernel of a CNN 
                # 'same' means padding with zeros evenly to the left/right or up/down of the input
              )(inputs)
conv2 = Conv1D(kernel_size=(128), 
                filters=32, 
                strides=(128),
                padding="same"
              )(inputs)

from keras.layers import Activation
a = Activation("sigmoid", name="sigmoid")(conv2) # Use 'sigmoid' as activate function for this layer

from keras.layers import multiply
mul = multiply([conv1, a]) # Apply multiply operator (*) to 'conv1' and 'a'
b = Activation("relu", name="relu")(mul) # Use 'relu' as activate function for this layer

from keras.layers import GlobalMaxPool1D
p = GlobalMaxPool1D()(b) # Global 'max pooling' operation for 1D temporal data.
 
from keras.layers import Dense
d = Dense(16)(p) # Dense layer, number of neurons
predictions = Dense(1, activation="sigmoid")(d)

from keras import Model
model = Model(inputs=inputs, outputs=predictions) # Build a model with 'inputs' and 'predictions'

In [None]:
model.compile(optimizer=my_opt, loss="binary_crossentropy", metrics=["acc"]) # Train the model, use 'binary_crossentropy' as a loss function
model.summary()

batch_size = 16 # the number of data samples in one training session.
num_batches = int(num_samples / batch_size)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 8, 15000)]   0           []                               
                                                                                                  
 conv1d_2 (Conv1D)              (None, 1, 32)        61440032    ['input_2[0][0]']                
                                                                                                  
 conv1d_1 (Conv1D)              (None, 1, 32)        61440032    ['input_2[0][0]']                
                                                                                                  
 sigmoid (Activation)           (None, 1, 32)        0           ['conv1d_2[0][0]']               
                                                                                              

In [None]:
# Show progress
for batch_num in tqdm(range(num_batches)):
  batch = X[batch_num * batch_size : (batch_num + 1) * batch_size]
  model.train_on_batch(
    batch, Y[batch_num * batch_size : (batch_num + 1) * batch_size]
  )

100%|██████████| 16/16 [00:49<00:00,  3.07s/it]


In [None]:
# Evaluation
print(model.evaluate(X, Y))

[0.4850543737411499, 0.8523985147476196]


# Bonus: Other project

In [14]:
# first neural network with keras tutorial
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [15]:
# load the dataset
dataset = loadtxt('/content/drive/MyDrive/Shared Drive/Lab3/Dataset/pima-indians-diabetes.data.csv', delimiter=',')
# split into input (X) and output (y) variables
X = dataset[:,0:8]
y = dataset[:,8]

In [16]:
# define the keras model
model = Sequential()
model.add(Dense(20, input_shape=(8,), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [17]:
from tensorflow import keras
# compile the keras model
opt = keras.optimizers.Adam(learning_rate=0.01) # Use 'Adam' algorithm
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [18]:
# fit the keras model on the dataset
model.fit(X, y, epochs=20, batch_size=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f88f87a0c50>

In [19]:
# evaluate the keras model
_, accuracy = model.evaluate(X, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 74.74


## Another Bonus: 

### 1. Use Grid search with **above model**

I have reference to this: https://www.geeksforgeeks.org/hyperparameter-tuning-using-gridsearchcv-and-kerasclassifier/

In [20]:
import warnings
warnings.filterwarnings('ignore')

About `StandardScaler()`:

Removing the **mean** and **scaling to unit variance**.

The standard score of a sample x is calculated as:

```z = (x - u) / s```

where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`.

In [21]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)

In [22]:
# A copy of model above
def build_fn(unit):
  model = Sequential()
  model.add(Dense(20, input_shape=(8,), activation='relu'))
  model.add(Dense(16, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
  return model

In [23]:
from keras.wrappers.scikit_learn import KerasClassifier
keras_model = KerasClassifier(build_fn=build_fn)

In [24]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Choose the best value of each defined parameter below
params={'batch_size':[100, 20, 50, 25, 32, 10], # Add '10' value to compare with the result above
        'nb_epoch':[200, 100, 300, 400, 20], # Add '20' value to compare with the result above
        'unit':[5,6, 10, 11, 12, 15],}

gs = GridSearchCV(estimator=keras_model, param_grid=params, cv=10)

In [25]:
gs.fit(X,y) # 'Fit' our (sample, target) to model and train again











GridSearchCV(cv=10,
             estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x7f88f63db790>,
             param_grid={'batch_size': [100, 20, 50, 25, 32, 10],
                         'nb_epoch': [200, 100, 300, 400, 20],
                         'unit': [5, 6, 10, 11, 12, 15]})

In [26]:
best_params=gs.best_params_
accuracy=gs.best_score_

In [27]:
print(best_params)
print(accuracy)

{'batch_size': 20, 'nb_epoch': 400, 'unit': 11}
0.7343130469322204


### 2. Use Grid search with **DecisionTreeClassifier**

The same implementation for this case. A little adjustment in model, use built-in `DecisionTreeClassifier` from `sklearn`, and the most interesting things is that we could focus on parameters from **Decision Tree** algorithm 

In [28]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [29]:
scaler = StandardScaler()
scaler.fit_transform(X)

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [30]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=20,random_state = 3)
dt = DecisionTreeClassifier() 
dt.fit(x_train,y_train)

DecisionTreeClassifier()

In [31]:
y_pred = dt.predict(x_test)
print("Accuracy score = ",accuracy_score(y_test,y_pred))
confusion_matrix(y_test,y_pred)

Accuracy score =  0.75


array([[10,  3],
       [ 2,  5]])

In [32]:
# Choose the best hyperparameter in one of these:
params_dict = {
    "criterion" : ['gini','entropy'],
    "max_depth" : [1,2,3,4,5,6,7,None]
}

gs = GridSearchCV(dt,param_grid = params_dict,cv = 10)
gs.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, None]})

In [33]:
print(gs.best_params_)

{'criterion': 'gini', 'max_depth': 5}


# Ex6: Handling Malicious Packer

> Installing UPX from official repo https://github.com/1.upx/upx/releases, packing all the files in folder "Benign PE Samples UPX"

> Note: All the given files are already packed with UPX

In [34]:
import os
files_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Testing Samples/"
files = os.listdir(files_path)
file_paths = [files_path + x for x in files]

In [None]:
file_paths

In [None]:
!cat /etc/os-release # Check the version of OS for appropriate downloading

NAME="Ubuntu"
VERSION="18.04.6 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.6 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic


In [37]:
%cd "/content/drive/MyDrive/Shared Drive/Lab3/Tool"
!wget https://github.com/upx/upx/releases/download/v4.0.0/upx-4.0.0-amd64_linux.tar.xz -P "." # '-P' Output path (if used)

/content/drive/MyDrive/Shared Drive/Lab3/Tool
--2022-11-06 16:38:04--  https://github.com/upx/upx/releases/download/v4.0.0/upx-4.0.0-amd64_linux.tar.xz
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/67031040/67157722-5558-4d57-aabd-9cea504ecbaa?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221106%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221106T163804Z&X-Amz-Expires=300&X-Amz-Signature=8d5ce45319869eaf20577820b7975455e550d47bb6824f032d5e5c17c10a65fb&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=67031040&response-content-disposition=attachment%3B%20filename%3Dupx-4.0.0-amd64_linux.tar.xz&response-content-type=application%2Foctet-stream [following]
--2022-11-06 16:38:04--  https://objects.githubusercontent.com/github-production-release-asset-2

In [38]:
!mkdir upx-4.0.0-linux
!tar -xvf upx-4.0.0-amd64_linux.tar.xz -C upx-4.0.0-linux # '-C': output path (if used)

mkdir: cannot create directory ‘upx-4.0.0-linux’: File exists
upx-4.0.0-amd64_linux/
upx-4.0.0-amd64_linux/COPYING
upx-4.0.0-amd64_linux/LICENSE
upx-4.0.0-amd64_linux/NEWS
upx-4.0.0-amd64_linux/README
upx-4.0.0-amd64_linux/THANKS
upx-4.0.0-amd64_linux/upx
upx-4.0.0-amd64_linux/upx-doc.html
upx-4.0.0-amd64_linux/upx-doc.txt
upx-4.0.0-amd64_linux/upx.1


In [39]:
!/content/drive/MyDrive/Shared\ Drive/Lab3/Tool/upx-4.0.0-linux/upx-4.0.0-amd64_linux/upx # Check if the excutable file works fine

                       Ultimate Packer for eXecutables
                          Copyright (C) 1996 - 2022
UPX 4.0.0       Markus Oberhumer, Laszlo Molnar & John Reiser   Oct 28th 2022

Usage: upx [-123456789dlthVL] [-qvfk] [-o file] file..

Commands:
  -1     compress faster                   -9    compress better
  -d     decompress                        -l    list compressed file
  -t     test compressed file              -V    display version number
  -h     give more help                    -L    display software license
Options:
  -q     be quiet                          -v    be verbose
  -oFILE write output to 'FILE'
  -f     force compression of suspicious files
  -k     keep backup files
file..   executables to (de)compress

Type 'upx --help' for more detailed help.

UPX comes with ABSOLUTELY NO WARRANTY; for details visit https://upx.github.io


In [40]:
# ================ DO NOT NEED TO RUN THIS SNIPPET CODE ================
import os
from subprocess import Popen, PIPE
cmd = "/content/drive/MyDrive/Shared Drive/Lab3/Tool/upx-4.0.0-linux/upx-4.0.0-amd64_linux/upx" # Path to "upx" 
for path in file_paths:
  # cmd2 = cmd + " \"" + path + "\""
  # print(cmd2)
  # res = Popen(cmd2, stdout=PIPE).communicate() --> This line has something wrong, producing errors.
  res = Popen([cmd, path], stdout=PIPE).communicate() # Output errors as the files are already packed
  print(res)
  if "error" in str(res[0]):
    print(path)
    os.remove(path)

(b'                       Ultimate Packer for eXecutables\n                          Copyright (C) 1996 - 2022\nUPX 4.0.0       Markus Oberhumer, Laszlo Molnar & John Reiser   Oct 28th 2022\n\n        File size         Ratio      Format      Name\n   --------------------   ------   -----------   -----------\n   4410880 ->   1828352   41.45%    win32/pe     appidtel.exe\n\nPacked 1 file.\n', None)
(b'                       Ultimate Packer for eXecutables\n                          Copyright (C) 1996 - 2022\nUPX 4.0.0       Markus Oberhumer, Laszlo Molnar & John Reiser   Oct 28th 2022\n\n        File size         Ratio      Format      Name\n   --------------------   ------   -----------   -----------\n   4406784 ->   1823744   41.38%    win32/pe     ARP.EXE\n\nPacked 1 file.\n', None)
(b'                       Ultimate Packer for eXecutables\n                          Copyright (C) 1996 - 2022\nUPX 4.0.0       Markus Oberhumer, Laszlo Molnar & John Reiser   Oct 28th 2022\n\n        File

File has already packed 

<center>
<img src="https://i.imgur.com/FF5pOg0.png" alt="upx-failed">
</center>

File has not yet packed

<center>
<img src="https://i.imgur.com/UKuQ6H7.png"  alt="upx-succeeded">
</center>

# Ex7: Build Packer Specifier Model

In [None]:
import os
from os import listdir
directories_with_labels = [
  ("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Benign PE Samples", 0),
  ("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Benign PE Samples UPX", 1),
  ("/content/drive/MyDrive/Shared Drive/Lab3/Dataset/Benign PE Samples Amber", 2),
]
list_of_samples = []
labels = []

In [None]:
for dataset_path, label in directories_with_labels:
  samples = [f for f in listdir(dataset_path)]
  for file in samples:
    file_path = os.path.join(dataset_path, file)
    list_of_samples.append(file_path)
    labels.append(label)

In [None]:
from sklearn.model_selection import train_test_split
samples_train, samples_test, labels_train, labels_test = train_test_split(
  list_of_samples, 
  labels, 
  test_size=0.3, 
  stratify=labels,
  random_state=11
)

In [None]:
import collections
from nltk import ngrams
import numpy as np

In [None]:
def read_file(file_path):
  """Reads in the binary sequence of a binary file."""
  with open(file_path, "rb") as binary_file:
    data = binary_file.read()
  return data

In [None]:
def byte_sequence_to_Ngrams(byte_sequence, N):
  """Creates a list of N-grams from a byte sequence."""
  Ngrams = ngrams(byte_sequence, N)
  return list(Ngrams)

In [None]:
def extract_Ngram_counts(file, N):
  """Takes a binary file and outputs the N-grams counts of its binary
  sequence."""
  filebyte_sequence = read_file(file)
  file_Ngrams = byte_sequence_to_Ngrams(filebyte_sequence, N)
  return collections.Counter(file_Ngrams)

In [None]:
def featurize_sample(sample, K1_most_frequent_Ngrams_list):
  """Takes a sample and produces a feature vector.
  The features are the counts of the K1 N-grams we've selected.
  """
  K1 = len(K1_most_frequent_Ngrams_list)
  feature_vector = K1 * [0]
  file_Ngrams = extract_Ngram_counts(sample, N)
  for i in range(K1):
    feature_vector[i] = file_Ngrams[K1_most_frequent_Ngrams_list[i]]
  return feature_vector

In [None]:
N_gram = 2
total_Ngram_count = collections.Counter([])
for file in samples_train:
  total_Ngram_count += extract_Ngram_counts(file, N_gram)
K1 = 100
K1_most_common_Ngrams = total_Ngram_count.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

In [None]:
Ngram_features_list_train = []
y_train = []
for i in range(len(samples_train)):
  file = samples_train[i]
  NGram_features = featurize_sample(file, K1_most_common_Ngrams_list)
  Ngram_features_list_train.append(NGram_features)
  y_train.append(labels_train[i])
X_train = Ngram_features_list_train

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train, y_train)

In [None]:
Ngram_features_list_test = []
y_test = []
for i in range(len(samples_test)):
  file = samples_test[i]
  NGram_features = featurize_sample(file, K1_most_common_Ngrams_list)
  Ngram_features_list_test.append(NGram_features)
  y_test.append(labels_test[i])
X_test = Ngram_features_list_test

In [None]:
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score, roc_auc_score

print("Confusion matrix:\n %s" % confusion_matrix(y_test, y_pred))

# Additional metrics (Not required)
print("Accurcy: %s" %(accuracy_score(y_test, y_pred)))
print("Recall: %s" %(recall_score(y_test, y_pred, average='micro')))
print("Precision: %s" %(precision_score(y_test, y_pred, average='micro')))
print("F1 Score: %s" %(f1_score(y_test, y_pred, average='micro')))
print("ROC AUC: %s" %(roc_auc_score(y_test, y_pred)))

Confusion matrix:
 [[69  0]
 [ 0 23]]
Accurcy: 1.0
Recall: 1.0
Precision: 1.0
F1 Score: 1.0
ROC AUC: 1.0


# Ex8: MalGAN – Build malicious softwares

In [None]:
# Move to Python executable folder 
%cd "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN" 
# Original reference: https://www.kaggle.com/code/fanbyprinciple/malgan-can-we-create-malware-on-the-fly/notebook

/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN


In [None]:
import os
import pandas as pd
from keras.models import load_model

# =========== FIXING PYTHON LIB ===========
# Line 5:
# "from tf.compat.v1.keras.backend import set_session"
# Line 34:
# "set_session(tf.compat.v1.Session(config=config))"
import MalGAN_utils 

# In file "MalGAN_preprocess.py"
# Line 6, 7:
# "import tensorflow as tf"
# "from tensorflow.keras.utils import pad_sequences"

import MalGAN_gen_adv_examples

In [None]:
save_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN/MalGAN_output"
model_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN/MalGAN_input/malconv.h5"
log_path = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN/MalGAN_output/adversarial_log.csv"
pad_percent = 0.1
threshold = 0.6
step_size = 0.01
limit = 0.
input_samples = "/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN/MalGAN_input/samplesIn.csv"

In [None]:
# In line 31 of "MalGAN_utils.py", change into:
# config = tf.compat.v1.ConfigProto()
MalGAN_utils.limit_gpu_memory(limit)

In [None]:
df = pd.read_csv(input_samples, header=None)
fn_list = df[0].values
list(enumerate(fn_list))

[(0,
  '0778a070b283d5f4057aeb3b42d58b82ed20e4eb_f205bd9628ff8dd7d99771f13422a665a70bb916'),
 (1,
  'fbd1a4b23eff620c1a36f7c9d48590d2fccda4c2_cc82281bc576f716d9a0271d206beb81ad078b53'),
 (2,
  'c095da034535f15a27c073dce54212a28e1af683_8e86441bc4f6a7fc492779caf280f1d769e0cd4d'),
 (3,
  '488e5eea345e24440f7d0d2a32fbafda314ee6ca_df473c0493d503828157e32664e28357a4094f7a'),
 (4,
  '7a359bcc1c7ac5f18eff7c3459dadefa9f9e4610_3b7ac6b0a7a720460526c78628c8616dad8c6a1f'),
 (5,
  '509038aad80431b8aa0c9b29bfce07fe7134fc7a_263fbe72e691a0c047f75ce75a585ba0af84ac94'),
 (6,
  '05202b7ebc42f2a159154f99cec58fc1bcfe2c17_8cc75062dbe1ddc363fa5178312845709f669e37'),
 (7,
  'c47ed37c0e1e8be110f889f5989aea6b1bb7fda4_42c3033e34d27c951f7db7ae6aa5f45b8ef472ac'),
 (8,
  '18a62e8ee8522c26e9970373895209ee15a56841_4f0f6e9d21bbbe1842e8e8d6d911561108389662'),
 (9,
  '12113db281913797b6d58079b45089d4d057f766_ce1c8adb310886deb3ace648d4152ddeb1ed32fb'),
 (10,
  '19c625db4021d4934916f42b97777098b0b8cf15_0192ba11326fe2298c8c

In [None]:
model = load_model(model_path)
model.summary() # Have an overview of model
# Use 'layers[0]' as input and 'layer[1]' as output for the next snippet code

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 80000)]      0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 80000, 8)     2048        ['input_1[0][0]']                
                                                                                                  
 conv1d_2 (Conv1D)              (None, 160, 128)     512128      ['embedding_1[0][0]']            
                                                                                                  
 conv1d_1 (Conv1D)              (None, 160, 128)     512128      ['embedding_1[0][0]']            
                                                                                            

In [None]:
model.layers[0].input

<KerasTensor: shape=(None, 80000) dtype=float32 (created by layer 'input_1')>

In [None]:
model.layers[1].output

<KerasTensor: shape=(None, 80000, 8) dtype=float32 (created by layer 'embedding_1')>

In [None]:
adv_samples, log = MalGAN_gen_adv_examples.gen_adv_samples(
  model,
  fn_list, 
  pad_percent, 
  step_size, 
  threshold
)

0778a070b283d5f4057aeb3b42d58b82ed20e4eb_f205bd9628ff8dd7d99771f13422a665a70bb916 not exist
inp:  []
len_List:  80000


ValueError: ignored

In [None]:
with open('/content/drive/MyDrive/Shared Drive/Lab3/Dataset/MalGAN/MalGAN_gen_adv_examples.py', 'r') as f:
  print(f.read())

from sklearn.neighbors import NearestNeighbors
from tensorflow.keras import backend as K
import MalGAN_utils
from MalGAN_preprocess import preprocess
import numpy as np

def gen_adv_samples(model, fn_list, pad_percent=0.1, step_size=0.001, thres=0.5):

    ###   search for nearest neighbor in embedding space ###
    def emb_search(org, adv, pad_idx, pad_len, neigh):
        out = org.copy()
        for idx in range(pad_idx, pad_idx+pad_len):
            target = adv[idx].reshape(1, -1)
            best_idx = neigh.kneighbors(target, 1, False)[0][0]
            out[0][idx] = best_idx
        return out


    max_len = int(model.input.shape[1])
    emb_layer = model.layers[1]
    emb_weight = emb_layer.get_weights()[0]
    inp2emb = K.function([model.layers[0].input], [model.layers[1].output]) # [function] Map sequence to embedding

    # Build neighbor searches
    neigh = NearestNeighbors()
    neigh.fit(emb_weight)

    log = MalGAN_utils.logger()
    adv_samples = []

    for e, fn i

In [None]:
log.save(log_path)
for fn, adv in zip(fn_list, adv_samples):
  _fn = fn.split('/')[-1]
  dst = os.path.join(save_path, _fn)
  print(dst)
  with open(dst, 'wb') as f:
    f.write(adv)