<a href="https://colab.research.google.com/github/summermccune/Tokenization-Testing-for-Malware-Data/blob/main/Tokenization_ALL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install datasets
!pip install hmmlearn

[0m

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from collections import Counter
from datasets import load_dataset
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer, ByteLevelBPETokenizer, SentencePieceBPETokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
from transformers import PreTrainedTokenizerFast
from gensim.models import Word2Vec
from hmmlearn import hmm
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,accuracy_score, f1_score, precision_score, recall_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data

In [3]:
#load one CSV per malware family from the mounted Drive folder
FakeRean, OnLineGames, Vobfus, Winwebsec, BHO, CeeInject, Renos = map(pd.read_csv, [
    '/content/drive/MyDrive/Tokenization-Final/Families/FakeRean.csv',
    '/content/drive/MyDrive/Tokenization-Final/Families/OnLineGames.csv',
    '/content/drive/MyDrive/Tokenization-Final/Families/Vobfus.csv',
    '/content/drive/MyDrive/Tokenization-Final/Families/Winwebsec.csv',
    '/content/drive/MyDrive/Tokenization-Final/Families/BHO.csv',
    '/content/drive/MyDrive/Tokenization-Final/Families/CeeInject.csv',
    '/content/drive/MyDrive/Tokenization-Final/Families/Renos.csv',
])

#stack the family frames, keep a reproducible 2.5% sample (testing only),
#then drop the first column
dataset = (
    pd.concat([FakeRean, OnLineGames, Vobfus, Winwebsec, BHO, CeeInject, Renos], ignore_index=True)
    .sample(frac=0.025, random_state=1)
    .iloc[:, 1:]
)

print(dataset.head())
print(dataset.tail())

            Label                                            Opcodes
2305       Vobfus  ljmp jae xchg push jae or out push jae add lea...
4388          BHO  add inc add pop inc outsl outsl insb gs outsb ...
1686  OnLineGames  pop fmuls mov ret nop mov icebp insb ljmp call...
4945          BHO  push mov cmpl jne mov leave ret cmpl jne cmpl ...
4197          BHO  add inc add pop inc outsl outsl insb gs outsb ...
          Label                                            Opcodes
5305  CeeInject  call mov mov mov sub sar mov mov mov mov sub s...
6476      Renos  jmp mov jmp mov jmp mov jmp mov jmp mov jmp mo...
3037  Winwebsec  push mov mov and and subl sub mov aam inc xor ...
5632  CeeInject  push mov movsbl mov imul mov mov imul mov mov ...
2821     Vobfus  ljmp jae xchg push jae or out push jae add fis...


# Functions

In [4]:
def count(df, c):
  """Add every whitespace-separated token found in df['Opcodes'] to the counter c (in place)."""
  for opcode_string in df['Opcodes']:
    c.update(opcode_string.split())

In [5]:
def removeNonVocab(vocab, series):
  """Keep only the tokens of each row that belong to vocab.

  Args:
    vocab: iterable of opcode strings to keep.
    series: iterable of whitespace-separated opcode strings.

  Returns:
    A list holding, for every row, the list of kept tokens in their original order.
  """
  # Exact set membership: vocabulary entries are compared literally (they are
  # never interpreted as regex syntax) and each token costs one hash lookup.
  keep = set(vocab)
  return [[token for token in row.split() if token in keep] for row in series]

In [6]:
def opcodes_to_numbers_dict(df):
  """Assign a consecutive integer id to every distinct opcode in df['Opcodes'].

  Each row of df['Opcodes'] is iterated item by item (for a tokenized row, item = token),
  so the keys match what opcodes_to_numbers looks up.
  Ids start at 0 and follow first-appearance order.

  Args:
    df: mapping/DataFrame whose 'Opcodes' entry is an iterable of token sequences.

  Returns:
    dict mapping each distinct opcode to its integer id (also printed).
  """
  opcode_to_number = {}
  for sample in df['Opcodes']:
    for opcode in sample:
      # setdefault keeps the id from the first time an opcode was seen
      opcode_to_number.setdefault(opcode, len(opcode_to_number))
  print(opcode_to_number)

  return opcode_to_number

In [7]:
def opcodes_to_numbers(dataset):
  """Translate every row of dataset['Opcodes'] into a list of integer ids.

  The id table comes from opcodes_to_numbers_dict(dataset); one list is returned per row.
  """
  lookup = opcodes_to_numbers_dict(dataset)
  return [[lookup[opcode] for opcode in sample] for sample in dataset['Opcodes']]

In [8]:
def train_hmm_models(opcodes, n_states, n_iter=100, n_restarts_short=100,
                     n_restarts_long=50, long_sequence_length=5000):
  """Fit one CategoricalHMM per opcode sequence, keeping the best of several restarts.

  Args:
    opcodes: iterable of integer-encoded opcode sequences (one per sample).
    n_states: number of hidden states of every model.
    n_iter: n_iter value passed to each hmm.CategoricalHMM.
    n_restarts_short: restarts used when a sequence has at most long_sequence_length items.
    n_restarts_long: restarts used for longer sequences.
    long_sequence_length: length threshold separating the two restart budgets.

  Returns:
    A list with one fitted model per sequence: the restart whose score on the
    training sequence was highest. An entry is None when no restart produced a
    score greater than -inf (including when the restart budget is 0).
  """
  hmm_models = []
  for opcode_seq in opcodes:
    # The column-vector view is identical for every restart, so build it once.
    observations = np.array(opcode_seq).reshape(-1, 1)
    if len(opcode_seq) <= long_sequence_length:
      n_restarts = n_restarts_short
    else:
      n_restarts = n_restarts_long

    best_model = None
    best_score = -np.inf
    for _ in range(n_restarts):
      # A fresh model per restart; no random_state is passed, so initialisation is left to hmmlearn.
      model = hmm.CategoricalHMM(n_components=n_states, n_iter=n_iter)
      model.fit(observations)

      #keep the model only if it scores higher than the current best
      score = model.score(observations)
      if score > best_score:
        best_model, best_score = model, score

    hmm_models.append(best_model)

  return hmm_models

In [9]:
def b_matrix_to_features(hmm_models, max_feature_length, opcode_to_number=None):
  """Turn the emission (B) matrix of each HMM into a fixed-length HMM2Vec feature vector.

  For every model the hidden state with the highest emission probability for the
  'mov' opcode is placed first, followed by the remaining states in their
  original order; the reordered matrix is flattened and then zero-padded or
  truncated to max_feature_length.

  Args:
    hmm_models: iterable of fitted models exposing an emissionprob_ array.
    max_feature_length: length of every returned feature vector.
    opcode_to_number: mapping from opcode to its integer id (must contain 'mov').
      When omitted, the notebook-level name opcode_to_number is used.

  Returns:
    A list with one 1-D NumPy array of length max_feature_length per model.

  Raises:
    NameError: no mapping was passed and no notebook-level opcode_to_number exists.
    KeyError: the mapping has no 'mov' entry.
    IndexError: the 'mov' id is not a valid column of a model's emission matrix.
  """
  hmm_models = list(hmm_models)
  if not hmm_models:
    return []

  if opcode_to_number is None:
    # Backward-compatible fallback for callers that rely on a notebook-level mapping.
    opcode_to_number = globals().get('opcode_to_number')
    if opcode_to_number is None:
      raise NameError("opcode_to_number is not defined; pass the opcode -> id mapping explicitly")
  mov_column = opcode_to_number['mov']

  hmm2vec_features = []
  for model in hmm_models:
    emission = model.emissionprob_

    #hidden state that has the highest probability with respect to the mov opcode
    mov_index = int(np.argmax(emission[:, mov_column]))

    #mov state first, every other state after it (for 2 states: [mov_index, 1 - mov_index])
    state_order = [mov_index] + [state for state in range(emission.shape[0]) if state != mov_index]

    # Flatten the rearranged B matrix to create the HMM2Vec feature vector
    feature_vector = emission[state_order].flatten()

    # pad or truncate feature_vector to ensure consistent length
    if len(feature_vector) < max_feature_length:
      feature_vector = np.pad(feature_vector, (0, max_feature_length - len(feature_vector)), mode='constant')
    elif len(feature_vector) > max_feature_length:
      feature_vector = feature_vector[:max_feature_length]

    hmm2vec_features.append(feature_vector)

  return hmm2vec_features

In [10]:
def batch_iterator(data):
  """Yield the 'Opcodes' column in consecutive batches of batch_size rows.

  Args:
    data: a pandas DataFrame with an 'Opcodes' column to iterate over. Any other
      value (the notebook passes the string 'train') selects the notebook-level
      dataset frame, which is what this function always used before.

  Yields:
    Positional slices of the 'Opcodes' column, each at most batch_size rows long
    (batch_size is the notebook-level setting).
  """
  frame = data if isinstance(data, pd.DataFrame) else dataset
  opcodes = frame['Opcodes']
  for start in range(0, len(frame), batch_size):
    # .iloc makes the positional slicing explicit (the frame's index is not 0..n-1 after sampling)
    yield opcodes.iloc[start : start + batch_size]

In [11]:
def prepare_tokenizer_trainer(alg):
    """
    Prepares the tokenizer and trainer with unknown & special tokens

    Args:
        alg: 'BPE', 'UNI' (Unigram) or 'WPC' (WordPiece).

    Returns:
        (tokenizer, trainer): an untrained Tokenizer using a Whitespace
        pre-tokenizer and the matching trainer, configured from the
        notebook-level unk_token, spl_tokens and v_size settings.

    Raises:
        ValueError: alg is not one of the supported names.
    """
    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token = unk_token))
        trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size=v_size)
    elif alg == 'UNI':
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(unk_token= unk_token, special_tokens = spl_tokens, vocab_size=v_size)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token = unk_token))
        trainer = WordPieceTrainer(special_tokens = spl_tokens, vocab_size=v_size)
    else:
        # Previously an unknown name fell through and failed on the unbound `tokenizer`.
        raise ValueError(f"Unsupported tokenizer algorithm: {alg!r} (expected 'BPE', 'UNI' or 'WPC')")

    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer

In [12]:
def train_tokenizer(alg):
    """
    Trains the tokenizer on the batches produced by batch_iterator and saves it

    Args:
        alg: 'BPE', 'UNI', 'WPC', 'BBPE' (byte-level BPE) or 'SPC' (SentencePiece BPE).

    Returns:
        The trained tokenizer. It is also written to
        drive/MyDrive/Tokenization-Final/Tokenizers/<alg>-trained.json.

    Raises:
        ValueError: alg is not one of the supported names.
    """
    if alg in ('BPE', 'UNI', 'WPC'):
        tokenizer, trainer = prepare_tokenizer_trainer(alg)
        tokenizer.train_from_iterator(batch_iterator('train'), trainer=trainer)
    elif alg == 'BBPE':
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train_from_iterator(batch_iterator('train'),vocab_size=v_size)
    elif alg == 'SPC':
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(batch_iterator('train'),vocab_size=v_size)
    else:
        raise ValueError(f"Unsupported tokenizer algorithm: {alg!r} (expected 'BPE', 'UNI', 'WPC', 'BBPE' or 'SPC')")

    # Save every algorithm; the BPE/UNI/WPC branch used to return before reaching this line.
    tokenizer.save("drive/MyDrive/Tokenization-Final/Tokenizers/"+alg+"-trained.json")
    return tokenizer

In [13]:
def encode(tokenizer, df):
  """Tokenize df['Opcodes'] in one batch and return the token strings of every row."""
  encodings = tokenizer.encode_batch(df['Opcodes'])
  return [item.tokens for item in encodings]

# Tokenization

In [14]:
#settings read by batch_iterator and the tokenizer helpers
v_size = 100  #vocabulary size passed to the trainers
batch_size = 1000  #rows per batch yielded by batch_iterator
spl_tokens = ["<UNK>", "<SEP>", "<MASK>", "<CLS>"]  #special tokens given to the trainers
unk_token = "<UNK>"  #unknown-token string

#### Top 31 unigrams


---



In [None]:
#work on a copy so the loaded dataset stays untouched
TOP31_df = dataset.copy()

#tally every opcode in the corpus
countTotal = Counter()
count(TOP31_df, countTotal)

#the 31 most frequent opcodes (with counts) and their names only
total_count = countTotal.most_common(31)
countList = [opcode for opcode, _ in total_count]
print(countList)

#drop everything outside that vocabulary; each row becomes a list of tokens
rows = removeNonVocab(countList, TOP31_df['Opcodes'])
TOP31_df['Opcodes'] = rows

print(TOP31_df.head())
print(TOP31_df.tail())

#write the most common opcodes and their counts to csv
df = pd.DataFrame(total_count, columns=['Opcodes', 'Count'])
df.to_csv('MostCommonOpcodes.csv', index=False)

#store the tokenized dataframe as pkl
TOP31_df.to_pickle('drive/MyDrive/Tokenization-Final/Tokenized Data/TOP31.pkl')

#### BPE Tokenizer

---





In [15]:
#train the BPE tokenizer
BPE_tokenizer = train_tokenizer('BPE')

#tokenize every sample and show the first one
BPE_tokens = encode(BPE_tokenizer, dataset)
print(BPE_tokens[0])

#new df: same rows as dataset, Opcodes replaced by the BPE tokens
BPE_df = dataset.assign(Opcodes=BPE_tokens)
print(BPE_df.head())
print(BPE_df.tail())

#store BPE tokens as pkl
BPE_df.to_pickle('drive/MyDrive/Tokenization-Final/Tokenized Data/BPE.pkl')

['l', 'jmp', 'ja', 'e', 'xchg', 'push', 'ja', 'e', 'or', 'out', 'push', 'ja', 'e', 'add', 'lea', 'ja', 'e', 'l', 'ret', 'in', 'inc', 'ja', 'e', 'push', 'dec', 'ja', 'e', 'pop', 'push', 'ja', 'e', 'push', 'push', 'ja', 'e', 'mov', 'push', 'ja', 'e', 'add', 'ja', 'push', 'ja', 'e', 'ja', 'e', 'ja', 'e', 'pop', 'push', 'ja', 'e', 'ja', 'e', 'ja', 'e', 'je', 'ja', 'e', 'push', 'push', 'ja', 'e', 'push', 'ja', 'e', 'mov', 'push', 'ja', 'e', 'cmp', 'dec', 'r', 'or', 'b', 'mov', 'dec', 'ja', 'e', 'l', 'ret', 'dec', 'ja', 'e', 'inc', 'push', 'ja', 'e', 's', 'a', 'h', 'f', 'dec', 'ja', 'e', 'add', 'inc', 'add', 'xchg', 'dec', 'ja', 'e', 'je', 'ja', 'e', 'xchg', 'push', 'ja', 'e', 'jn', 'o', 'ja', 'e', 'jne', 'ja', 'e', 'xchg', 'push', 'ja', 'e', 'mov', 'push', 'ja', 'e', 'adc', 'b', 'out', 'sb', 'push', 'ja', 'e', 'out', 'sb', 'push', 'ja', 'e', 'jn', 'p', 'ja', 'e', 'pop', 'a', 'push', 'ja', 'e', 'j', 'l', 'ja', 'e', 'aa', 's', 'dec', 'ja', 'e', 'cl', 't', 'd', 'dec', 'ja', 'e', 'j', 'g', 'e',

#### WPC (WordPiece)


---



In [None]:
#train the WordPiece tokenizer
WPC_tokenizer = train_tokenizer('WPC')

#tokenize every sample and show the first one
WPC_tokens = encode(WPC_tokenizer, dataset)
print(WPC_tokens[0])

#new df: same rows as dataset, Opcodes replaced by the WPC tokens
WPC_df = dataset.assign(Opcodes=WPC_tokens)
print(WPC_df.head())
print(WPC_df.tail())

#store WPC tokens as pkl
WPC_df.to_pickle('drive/MyDrive/Tokenization-Final/Tokenized Data/WPC.pkl')

#### SPC (SentencePiece)


---



In [16]:
#train the SentencePiece BPE tokenizer
SPC_tokenizer = train_tokenizer('SPC')

#tokenize every sample and show the first one
SPC_tokens = encode(SPC_tokenizer, dataset)
print(SPC_tokens[0])

#new df: same rows as dataset, Opcodes replaced by the SPC tokens
SPC_df = dataset.assign(Opcodes=SPC_tokens)
print(SPC_df.head())
print(SPC_df.tail())

#store SPC tokens as pkl
SPC_df.to_pickle('drive/MyDrive/Tokenization-Final/Tokenized Data/SPC.pkl')

['▁l', 'j', 'mp', '▁j', 'a', 'e', '▁xchg', '▁push', '▁j', 'a', 'e', '▁or', '▁out', '▁push', '▁j', 'a', 'e', '▁add', '▁lea', '▁j', 'a', 'e', '▁l', 'r', 'et', '▁in', '▁inc', '▁j', 'a', 'e', '▁push', '▁dec', '▁j', 'a', 'e', '▁pop', '▁push', '▁j', 'a', 'e', '▁push', '▁push', '▁j', 'a', 'e', '▁mov', '▁push', '▁j', 'a', 'e', '▁add', '▁j', 'a', '▁push', '▁j', 'a', 'e', '▁j', 'a', 'e', '▁j', 'a', 'e', '▁pop', '▁push', '▁j', 'a', 'e', '▁j', 'a', 'e', '▁j', 'a', 'e', '▁je', '▁j', 'a', 'e', '▁push', '▁push', '▁j', 'a', 'e', '▁push', '▁j', 'a', 'e', '▁mov', '▁push', '▁j', 'a', 'e', '▁cmp', '▁dec', '▁r', 'or', 'b', '▁mov', '▁dec', '▁j', 'a', 'e', '▁l', 'r', 'et', '▁dec', '▁j', 'a', 'e', '▁inc', '▁push', '▁j', 'a', 'e', '▁s', 'a', 'h', 'f', '▁dec', '▁j', 'a', 'e', '▁add', '▁inc', '▁add', '▁xchg', '▁dec', '▁j', 'a', 'e', '▁je', '▁j', 'a', 'e', '▁xchg', '▁push', '▁j', 'a', 'e', '▁jn', 'o', '▁j', 'a', 'e', '▁jn', 'e', '▁j', 'a', 'e', '▁xchg', '▁push', '▁j', 'a', 'e', '▁mov', '▁push', '▁j', 'a', 'e', '▁

#### UNI (Unigram Subword)


---



In [15]:
#train the Unigram tokenizer
UNI_tokenizer = train_tokenizer('UNI')

#tokenize every sample and show the first one
UNI_tokens = encode(UNI_tokenizer, dataset)
print(UNI_tokens[0])

#new df: same rows as dataset, Opcodes replaced by the UNI tokens
UNI_df = dataset.assign(Opcodes=UNI_tokens)
print(UNI_df.head())
print(UNI_df.tail())

#store UNI tokens as pkl
UNI_df.to_pickle('drive/MyDrive/Tokenization-Final/Tokenized Data/UNI.pkl')

['ljmp', 'j', 'a', 'e', 'xchg', 'push', 'j', 'a', 'e', 'o', 'r', 'out', 'push', 'j', 'a', 'e', 'add', 'lea', 'j', 'a', 'e', 'lret', 'in', 'inc', 'j', 'a', 'e', 'push', 'dec', 'j', 'a', 'e', 'p', 'o', 'p', 'push', 'j', 'a', 'e', 'push', 'push', 'j', 'a', 'e', 'mov', 'push', 'j', 'a', 'e', 'add', 'j', 'a', 'push', 'j', 'a', 'e', 'j', 'a', 'e', 'j', 'a', 'e', 'p', 'o', 'p', 'push', 'j', 'a', 'e', 'j', 'a', 'e', 'j', 'a', 'e', 'j', 'e', 'j', 'a', 'e', 'push', 'push', 'j', 'a', 'e', 'push', 'j', 'a', 'e', 'mov', 'push', 'j', 'a', 'e', 'cmp', 'dec', 'r', 'o', 'r', 'b', 'mov', 'dec', 'j', 'a', 'e', 'lret', 'dec', 'j', 'a', 'e', 'inc', 'push', 'j', 'a', 'e', 's', 'a', 'h', 'f', 'dec', 'j', 'a', 'e', 'add', 'inc', 'add', 'xchg', 'dec', 'j', 'a', 'e', 'j', 'e', 'j', 'a', 'e', 'xchg', 'push', 'j', 'a', 'e', 'j', 'n', 'o', 'j', 'a', 'e', 'j', 'n', 'e', 'j', 'a', 'e', 'xchg', 'push', 'j', 'a', 'e', 'mov', 'push', 'j', 'a', 'e', 'a', 'd', 'c', 'b', 'outs', 'b', 'push', 'j', 'a', 'e', 'outs', 'b', 'p

# Embedding

#### Word2Vec


---



In [None]:
def _mean_word2vec_vectors(model, documents, vector_size):
  """Average the word vectors of each document; documents without any known token get a zero vector."""
  averaged = []
  for tokens in documents:
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
      averaged.append(np.mean(vectors, axis=0))
    else:
      # np.mean([]) would give a NaN scalar and make the stacked array ragged.
      averaged.append(np.zeros(vector_size, dtype=np.float32))
  return np.array(averaged)


def word2vec_embeddings(dataframe):
  """Train two Word2Vec models on dataframe['Opcodes'] and return mean-pooled sample embeddings.

  Side effects: writes word2vec_ALL_embeddings.npy (vector_size=100, window=30)
  and word2vec_SVM_embeddings.npy (vector_size=31, window=10) to the working
  directory, overwriting files left by a previous call.

  Args:
    dataframe: mapping/DataFrame whose 'Opcodes' entry holds one token list per sample.

  Returns:
    NumPy array with one row per sample: the embeddings of the second model
    (vector_size=31). Samples with no tokens get an all-zero row.
  """
  documents = dataframe['Opcodes']

  #word2vec N = 100, W = 30
  model = Word2Vec(documents, min_count=1, vector_size=100, window=30)
  embeddings = _mean_word2vec_vectors(model, documents, 100)
  np.save('word2vec_ALL_embeddings.npy', embeddings)

  #word2vec N = 31, W = 10
  model = Word2Vec(documents, min_count=1, vector_size=31, window=10)
  embeddings = _mean_word2vec_vectors(model, documents, 31)
  np.save('word2vec_SVM_embeddings.npy', embeddings)
  return embeddings

TOP31

In [None]:
TOP31_w2v_embeddings = word2vec_embeddings(TOP31_df)
#save as pkl (the embeddings are a NumPy array, which has no .to_pickle method)
pd.to_pickle(TOP31_w2v_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/TOP31_w2v_embeddings.pkl')

BPE

In [None]:
BPE_w2v_embeddings = word2vec_embeddings(BPE_df)
#save as pkl (the embeddings are a NumPy array, which has no .to_pickle method)
pd.to_pickle(BPE_w2v_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/BPE_w2v_embeddings.pkl')

WPC

In [None]:
WPC_w2v_embeddings = word2vec_embeddings(WPC_df)
#save as pkl (the embeddings are a NumPy array, which has no .to_pickle method)
pd.to_pickle(WPC_w2v_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/WPC_w2v_embeddings.pkl')

SPC

In [None]:
SPC_w2v_embeddings = word2vec_embeddings(SPC_df)
#save as pkl (the embeddings are a NumPy array, which has no .to_pickle method)
pd.to_pickle(SPC_w2v_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/SPC_w2v_embeddings.pkl')

UNI

In [None]:
UNI_w2v_embeddings = word2vec_embeddings(UNI_df)
#save as pkl (the embeddings are a NumPy array, which has no .to_pickle method)
pd.to_pickle(UNI_w2v_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/UNI_w2v_embeddings.pkl')

#### HMM2Vec


---



In [None]:
def hmm2vec_embeddings(dataframe, n_states, max_feature_length=100):
  """Build one HMM2Vec feature vector per sample of dataframe['Opcodes'].

  Pipeline: opcodes -> integer ids -> one HMM per sample -> flattened emission matrix.

  Args:
    dataframe: DataFrame whose 'Opcodes' column holds one token sequence per sample.
    n_states: number of hidden states of each HMM.
    max_feature_length: length every feature vector is padded/truncated to.

  Returns:
    A list with one feature vector per sample.

  Side effects:
    Sets the notebook-level name opcode_to_number to the opcode -> id mapping
    of this dataframe.
  """
  # b_matrix_to_features looks up 'mov' in the notebook-level opcode_to_number
  # mapping, which no other cell assigns, so publish it here. It is rebuilt by
  # the same function opcodes_to_numbers uses, so both agree on the ids.
  global opcode_to_number
  opcode_to_number = opcodes_to_numbers_dict(dataframe)

  #convert opcodes to numbers
  opcode_sequences = opcodes_to_numbers(dataframe)
  hmm_models = train_hmm_models(opcode_sequences, n_states)
  hmm2vec_features = b_matrix_to_features(hmm_models, max_feature_length)
  return hmm2vec_features

TOP31

In [None]:
TOP31_hmm2vec_embeddings = hmm2vec_embeddings(TOP31_df, 2)
#save as pkl (the embeddings are a plain list, which has no .to_pickle method)
pd.to_pickle(TOP31_hmm2vec_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/TOP31_hmm2vec_embeddings.pkl')

BPE

In [None]:
BPE_hmm2vec_embeddings = hmm2vec_embeddings(BPE_df, 2)
#save as pkl (the embeddings are a plain list, which has no .to_pickle method)
pd.to_pickle(BPE_hmm2vec_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/BPE_hmm2vec_embeddings.pkl')

WPC

In [None]:
WPC_hmm2vec_embeddings = hmm2vec_embeddings(WPC_df, 2)
#save as pkl (the embeddings are a plain list, which has no .to_pickle method)
pd.to_pickle(WPC_hmm2vec_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/WPC_hmm2vec_embeddings.pkl')

SPC

In [None]:
SPC_hmm2vec_embeddings = hmm2vec_embeddings(SPC_df, 2)
#save as pkl (the embeddings are a plain list, which has no .to_pickle method)
pd.to_pickle(SPC_hmm2vec_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/SPC_hmm2vec_embeddings.pkl')

UNI

In [None]:
UNI_hmm2vec_embeddings = hmm2vec_embeddings(UNI_df, 2)
#save as pkl (the embeddings are a plain list, which has no .to_pickle method)
pd.to_pickle(UNI_hmm2vec_embeddings, 'drive/MyDrive/Tokenization-Final/Embeddings/UNI_hmm2vec_embeddings.pkl')

# Classification

In [None]:
def SVM_model(X_train, y_train, X_test, y_test, kernel, name):
  """Fit an SVC, print weighted test metrics and plot a row-normalized confusion matrix.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    kernel: kernel name passed to SVC.
    name: title of the confusion-matrix plot.

  Returns:
    None; results are printed and plotted.
  """
  svm = SVC(kernel=kernel)
  svm.fit(X_train, y_train)
  y_pred = svm.predict(X_test)

  print("Accuracy:", accuracy_score(y_test, y_pred))
  print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
  print("Precision:", precision_score(y_test, y_pred, average='weighted'))
  print("Recall:", recall_score(y_test, y_pred, average='weighted'))

  # Take the axis labels from the fitted model rather than a hard-coded list, so
  # the number and order of labels always match the matrix (the notebook loads
  # seven families, while the previous fixed list named only six).
  class_labels = svm.classes_
  # normalize='true' divides each row by its true-label count without the
  # divide-by-zero that manual normalization hits for classes absent from y_test.
  cm_normalized = confusion_matrix(y_test, y_pred, labels=class_labels, normalize='true')
  disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=class_labels)
  disp.plot(cmap=plt.cm.Blues)
  plt.title(name)
  plt.show()

## TOP31


---



In [None]:
#TOP31 train and test
#split the word2vec embeddings, the hmm2vec embeddings and the family labels in
#one call so their rows stay aligned; this also defines y_train / y_test, the
#label arguments SVM_model expects
(TOP31_w2v_train, TOP31_w2v_test,
 TOP31_hmm2vec_train, TOP31_hmm2vec_test,
 y_train, y_test) = train_test_split(
    TOP31_w2v_embeddings, TOP31_hmm2vec_embeddings, TOP31_df['Label'],
    test_size=0.2, random_state=42)

### SVM

In [None]:
#linear SVM on the TOP31 word2vec embeddings
SVM_model(
    X_train=TOP31_w2v_train,
    y_train=y_train,
    X_test=TOP31_w2v_test,
    y_test=y_test,
    kernel='linear',
    name='TOP31 + w2v + SVM',
)

In [None]:
#linear SVM on the TOP31 hmm2vec embeddings
SVM_model(
    X_train=TOP31_hmm2vec_train,
    y_train=y_train,
    X_test=TOP31_hmm2vec_test,
    y_test=y_test,
    kernel='linear',
    name='TOP31 + hmm2vec + SVM',
)

### RF

## BPE


---



In [None]:
#BPE train and test
#split the word2vec embeddings, the hmm2vec embeddings and the family labels in
#one call so their rows stay aligned; this also defines y_train / y_test, the
#label arguments SVM_model expects
(BPE_w2v_train, BPE_w2v_test,
 BPE_hmm2vec_train, BPE_hmm2vec_test,
 y_train, y_test) = train_test_split(
    BPE_w2v_embeddings, BPE_hmm2vec_embeddings, BPE_df['Label'],
    test_size=0.2, random_state=42)

### SVM

In [None]:
#linear SVM on the BPE word2vec embeddings (this cell sits in the BPE section;
#it previously reused the TOP31 arrays and title and lacked the closing parenthesis)
SVM_model(BPE_w2v_train, y_train, BPE_w2v_test, y_test, 'linear', 'BPE + w2v + SVM')

### RF

## WPC


---



In [None]:
#WPC train and test
#split the word2vec embeddings, the hmm2vec embeddings and the family labels in
#one call so their rows stay aligned; this also defines y_train / y_test, the
#label arguments SVM_model expects
(WPC_w2v_train, WPC_w2v_test,
 WPC_hmm2vec_train, WPC_hmm2vec_test,
 y_train, y_test) = train_test_split(
    WPC_w2v_embeddings, WPC_hmm2vec_embeddings, WPC_df['Label'],
    test_size=0.2, random_state=42)

### SVM

### RF

## SPC


---



In [None]:
#SPC train and test
#split the word2vec embeddings, the hmm2vec embeddings and the family labels in
#one call so their rows stay aligned; this also defines y_train / y_test, the
#label arguments SVM_model expects
(SPC_w2v_train, SPC_w2v_test,
 SPC_hmm2vec_train, SPC_hmm2vec_test,
 y_train, y_test) = train_test_split(
    SPC_w2v_embeddings, SPC_hmm2vec_embeddings, SPC_df['Label'],
    test_size=0.2, random_state=42)

### SVM

### RF

## UNI


---



In [None]:
#UNI train and test
#split the word2vec embeddings, the hmm2vec embeddings and the family labels in
#one call so their rows stay aligned; this also defines y_train / y_test, the
#label arguments SVM_model expects
(UNI_w2v_train, UNI_w2v_test,
 UNI_hmm2vec_train, UNI_hmm2vec_test,
 y_train, y_test) = train_test_split(
    UNI_w2v_embeddings, UNI_hmm2vec_embeddings, UNI_df['Label'],
    test_size=0.2, random_state=42)

### SVM

### RF