# Classifying code from scratch
This file implements this tutorial (https://huggingface.co/docs/transformers/tasks/sequence_classification) with the PHP dataset

In [2]:
# test imports
import re
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from datasets import Dataset
import torch

from tokenizers import ByteLevelBPETokenizer

### Data preprocessing
csv_name is the result of a NIST dataset run through Kyle's parser<br>
folder_name is where all the data files are actually stored>br>
num_cases is the number of test cases you want to process (concretely, the first num_cases data points are analyzed)

file_formatting incrementally removes all types of comments by specific regex. The data files seem to have HTML, Python, and PHP style comments. Then we remove newlines.

files is a list of strings containing the formatted file contents<br>
labels is a list of 0s and 1s, where 0 = good and 1 - bad

In [3]:
# php csv
csv_name = 'parsed_data.csv'
folder_name = 'data/2022-05-12-php-test-suite-sqli-v1-0-0/'

df = pd.read_csv(csv_name)
df_len = df.shape[0]
# num_cases = int(df_len / 4)
num_cases = 100
df = df.head(num_cases) # take top (num_cases) files for now
filenames = df['file_location']

def file_formatting(file_location):
    file_path = file_location
    raw_contents = open(folder_name + file_path, "r").read()
    remove = re.sub("(<!--.*?-->)", "", raw_contents, flags=re.DOTALL) # html
    remove = re.sub('#.*?\n', '', remove, flags=re.DOTALL) # python
    remove = re.sub('\/\*\*[^*]*\*+([^/][^*]*\*+)*\/', '', remove, flags=re.S) # php
    remove = remove.replace('\n', '').replace(' ','') # newlines
    return remove

# data contains strings of all files
files = []
for f in filenames:
    try:
        fstring = file_formatting(f)
    except:
        pass
    files.append(fstring)

# get labels
labels = df['state']
def replace_good_bad(lst):
    mapping = {"good": 0, "bad": 1}
    return [mapping.get(item, item) for item in lst]
labels = replace_good_bad(labels)

# create df of files and labels
data = pd.DataFrame({'label': labels, 'file': files})
#print(data, files, labels)

### Preparing dataset
Converts dataframe into Tensorflow dataset, adds a train/test split to dataset, converts to correct Dataset input type

In [15]:
model_name = 'microsoft/codebert-base' # code

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# creating tf dataset
tf_data = Dataset.from_pandas(data)
tf_data = tf_data.train_test_split(test_size=0.3)

def preprocess_function(examples):
    return tokenizer(examples["file"], truncation=True)
#tf_data = tf_data.map(preprocess_function, batched=True)

def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["file"], truncation=True)

tf_data = tf_data.map(tokenize_dataset)
#tf_data = model.prepare_tf_dataset(tf_data, batch_size=16, shuffle=True, tokenizer=tokenizer)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/70 [00:00<?, ?ex/s]

  0%|          | 0/30 [00:00<?, ?ex/s]

### Adding evaluation
This cell is separate because the evaluate imports can cause errors

In [17]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

### Hyperparameter and label initialization
Play with the hyperparams!

In [20]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
# positive = vulnerable

from transformers import create_optimizer

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tf_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Convert dataset to correct input type

In [22]:
tf_train_set = model.prepare_tf_dataset(
    tf_data["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tf_data["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

model.compile(optimizer=optimizer)


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


### Train model!
This will take a while

In [24]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

callbacks = [metric_callback]
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fef8fff14f0>

### Test one case
Ensure labels and outputs are working

In [28]:
# one case
#tokenizer = AutoTokenizer.from_pretrained(model)
inputs = tokenizer(files[0], return_tensors="tf")

#model = TFAutoModelForSequenceClassification.from_pretrained(model)
logits = model(**inputs).logits

predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class_id])
print(labels[0])

NEGATIVE
0


### Model accuracy
This cell computes the accuracy of the model a bit slowly. Feel free to improve and add model evaluation methods are desired

In [30]:
acc = 0

for idx in range(len(files)):
    inputs = tokenizer(files[idx], return_tensors="tf")
    logits = model(**inputs).logits
    predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
    print(idx, model.config.id2label[predicted_class_id], labels[idx])
    if predicted_class_id == labels[idx]:
        acc = acc + 1

print(acc/len(files))

NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 1
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0
NEGATIVE 0