<a href="https://colab.research.google.com/github/summermccune/Tokenization-Testing-for-Malware-Data/blob/main/Word2Vec/Word2Vec%26SVM_wordpiece.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Wordpiece implementation based on huggingface
https://huggingface.co/learn/nlp-course/chapter6/6

In [22]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import glob
from sklearn.svm import SVC
from transformers import AutoTokenizer
from collections import defaultdict


In [23]:
def read_files(malwarefolder, malwareType):
  for sample in malwarefolder:
    with open(sample, 'r') as f:
      data = f.read()
      data = data.replace('\n', ' ')
    #append sample to opcodes list in df
    df.loc[len(df)] = [data, malwareType]

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
#from hugging face implementation
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

In [26]:
#from huggingface implementation
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [27]:
#from huggingface implementation
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

In [28]:
#from huggingface implementation
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])

In [None]:
#creating dataframe for samples and label
df = pd.DataFrame(columns = ['opcodes','label'])

#specify paths for each malware
winwebsec = glob.glob("/content/drive/MyDrive/Data/malware2/winwebsec/*.txt")
zbot = glob.glob("/content/drive/MyDrive/Data/malware2/zbot/*.txt")
zeroaccess = glob.glob("/content/drive/MyDrive/Data/malware2/zeroaccess/*.txt")

#read files
read_files(winwebsec, 0)
read_files(zbot, 1)
read_files(zeroaccess, 2)

#huggingface wordpiece implementation
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

word_freqs = defaultdict(int)
for text in df['opcodes']:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}

pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break

best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

vocab_size = 70 #could change this variable
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)


In [30]:
tokenized_opcodes = []
for sample in df['opcodes']:
  tokenized_opcodes.append(tokenize(sample))

df['tokenized_opcodes'] = tokenized_opcodes

print(df.shape)
df.head()

(7801, 3)


Unnamed: 0,opcodes,label,tokenized_opcodes
0,push mov sub push push mov mov mov mov mov mov...,0,"[p, ##u, ##s, ##h, m, ##o, ##v, s, ##u, ##b, p..."
1,push mov sub mov mov call mov pop retn push mo...,0,"[p, ##u, ##s, ##h, m, ##o, ##v, s, ##u, ##b, m..."
2,push mov sub xor mov push add mov mov cmp jnz ...,0,"[p, ##u, ##s, ##h, m, ##o, ##v, s, ##u, ##b, x..."
3,push mov sub push push push mov mov mov mov mo...,0,"[p, ##u, ##s, ##h, m, ##o, ##v, s, ##u, ##b, p..."
4,mov push mov sub lea add mov push call push pu...,0,"[m, ##o, ##v, p, ##u, ##s, ##h, m, ##o, ##v, s..."


In [31]:
#word2vec
model = Word2Vec(df['tokenized_opcodes'], min_count=1)

In [32]:
#SVM
X = np.array([np.mean([model.wv[word] for word in text if word in model.wv]
                        , axis=0) for text in df['opcodes']])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

#accuracy and conf matrix
print("Accuracy:", np.mean(y_pred == y_test))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9295774647887324
[[436   6   0]
 [ 42 152   6]
 [  1   0 138]]
