In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon Dec  9 13:30:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Init

In [14]:
!pip install transformers

!pip install datasets

!pip install sacrebleu

!pip install torch>=1.13.0

!pip install fsspec==2024.9.0



In [15]:
import random

# Pin the state of Python's built-in `random` module (and only that module).
random.seed(a=120)

In [16]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset and checkpoint paths used below live under it.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [17]:
# Remount the Drive even though it is already mounted (force_remount=True).
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


## Load Datasets

In [18]:
# Folder on the mounted Drive that holds the experiment CSV files.
BASE_PATH = "/content/gdrive/MyDrive/experiments"

# Training CSVs: unrestricted and restricted variants.
TRAIN_UNRESTRICTED_PATH = BASE_PATH + "/unrestricted_train_dataset-50.csv"
TRAIN_RESTRICTED_PATH = BASE_PATH + "/restricted_train_dataset.csv"

# Test CSVs (the OOV_ prefix presumably stands for out-of-vocabulary — confirm).
OOV_UNRESTRICTED_PATH = BASE_PATH + "/unrestricted_test_dataset.csv"
OOV_RESTRICTED_PATH = BASE_PATH + "/restricted_test_dataset.csv"

In [19]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets

# Seed for shuffling the combined splits. Without an explicit seed the row
# order differs on every run, which makes results impossible to reproduce.
# Same value as the `random.seed(120)` call near the top of the notebook.
SHUFFLE_SEED = 120


def _load_labeled_csv(csv_path, label):
    """Load one CSV and tag every row with a constant class label.

    Args:
        csv_path: Path of the CSV file to read.
        label: Integer class id written to the new 'label' column
            (0 = unrestricted, 1 = restricted in this notebook).

    Returns:
        The DatasetDict produced by `load_dataset`; its single 'train' split
        carries the extra 'label' column.
    """
    loaded = load_dataset('csv', data_files=csv_path)
    loaded['train'] = loaded['train'].add_column('label', [label] * len(loaded['train']))
    return loaded


unrestricted_train = _load_labeled_csv(TRAIN_UNRESTRICTED_PATH, 0)
restricted_train = _load_labeled_csv(TRAIN_RESTRICTED_PATH, 1)
unrestricted_test = _load_labeled_csv(OOV_UNRESTRICTED_PATH, 0)
restricted_test = _load_labeled_csv(OOV_RESTRICTED_PATH, 1)

# Merge both classes per split, then shuffle so the classes are interleaved.
train_dataset = concatenate_datasets(
    [unrestricted_train['train'], restricted_train['train']]
).shuffle(seed=SHUFFLE_SEED)

test_dataset = concatenate_datasets(
    [unrestricted_test['train'], restricted_test['train']]
).shuffle(seed=SHUFFLE_SEED)

dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['pair_type', 'ltl', 'en', 'label'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['pair_type', 'ltl', 'en', 'label'],
        num_rows: 15000
    })
})

## Preprocessing

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "distilbert-base-uncased" checkpoint, the same checkpoint the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the English text of a batch of examples.

    Args:
        examples: Mapping with an "en" entry holding the text to tokenize.

    Returns:
        Whatever the module-level `tokenizer` returns for that text, with
        truncation enabled.
    """
    english_text = examples["en"]
    encoded = tokenizer(english_text, truncation=True)
    return encoded

In [None]:
# Apply preprocess_function to every split, batch by batch.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Sanity-check the tokenization on the first training example.
# The row is fetched once: `dataset[0]["col"]` reads a single record, whereas
# the previous `dataset["col"][0]` read the whole column on each of its five uses.
first_example = tokenized_dataset['train'][0]
item = first_example['en']
first_input_ids = first_example['input_ids']
decoded_item = tokenizer.decode(first_input_ids)

# id -> token lookup: the inverse of the tokenizer's token -> id vocabulary.
reversed_vocab = {i: w for w, i in tokenizer.get_vocab().items()}

print(item)
print(decoded_item)
# Word count of the raw text, number of token ids, word count of the decoded text.
print(len(item.split(' ')), len(first_input_ids), len(decoded_item.split(' ')))
print(first_input_ids)
print([reversed_vocab[i] for i in first_input_ids])

In [None]:
# Display the tokenized splits.
tokenized_dataset

In [None]:
# Peek at ten labels (rows 49000-49009) of the shuffled training split.
tokenized_dataset['train']['label'][49000:49010]

## Training

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator built from the same tokenizer; it is passed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence classifier on top of distilbert-base-uncased with two output labels
# (0 and 1, matching the 'label' column added when the CSVs were loaded).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Size the token embeddings to the tokenizer's length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Compare the tokenizer's vocabulary size with the model's configured one.
# The header used to read "For T5:", but the model and tokenizer loaded in this
# notebook are distilbert-base-uncased, so that label was misleading.
print("For DistilBERT:")
print(f"Tokenizer vocab_size: {tokenizer.vocab_size}")
print(f"Model vocab size: {model.config.vocab_size}\n")

In [None]:
# Output folder for checkpoints written during training.
RESULTS_DIR = "/content/gdrive/MyDrive/experiments/classifier"

# Step budget that drives the run length, evaluation and checkpoint cadence.
STEPS = 31

training_args = TrainingArguments(
    output_dir=RESULTS_DIR,
    # --- run length ---
    # num_train_epochs=1,
    max_steps=STEPS,
    # --- optimizer ---
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_beta2=0.98,
    # NOTE(review): warmup_steps (100) is larger than max_steps (31) — confirm this is intended.
    warmup_steps=100,
    # --- batching / precision ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # --- evaluation: every STEPS steps ---
    evaluation_strategy="steps",
    eval_steps=STEPS,
    # --- checkpointing: every STEPS - 1 steps, keeping a single checkpoint ---
    save_strategy='steps',
    save_steps=STEPS - 1,
    save_total_limit=1,
)

In [None]:
import numpy as np
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version
# still provides it.
metric = load_metric("accuracy")

def compute_metrics(eval_preds):
    """Compute scalar classification metrics for the Trainer.

    Args:
        eval_preds: Object exposing `predictions` (per-class scores; argmax
            over the last axis gives the predicted class) and `label_ids`
            (the reference class ids).

    Returns:
        Dict with numeric 'accuracy', 'f1', 'precision' and 'recall' values.
        Precision/recall/F1 use average='binary'.
    """
    labels = eval_preds.label_ids
    preds = eval_preds.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    # `metric.compute` returns a dict such as {"accuracy": 0.93}. Unwrap the
    # number so the reported 'accuracy' is a scalar rather than a nested dict.
    accuracy = metric.compute(references=labels, predictions=preds)["accuracy"]
    # Only scalars are returned. The raw `preds` / `labels` arrays were
    # previously included too, but they are not metrics: they cannot be logged
    # as values and are not JSON-serializable when the trainer state is saved.
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [None]:
# Assemble the Trainer from the model, arguments, data, tokenizer, collator and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Alternative kept for reference: start from a previously saved checkpoint.
# trainer.train("/content/gdrive/MyDrive/final_thesis_experiments/checkpoint-5625/")
trainer.train()

## Test

In [None]:
# Run evaluation on the tokenized test split.
trainer.evaluate(tokenized_dataset["test"])

In [None]:
from transformers import DistilBertForSequenceClassification

In [None]:
# Show the first 300 English sentences of the test split.
list(tokenized_dataset['test']['en'][:300])

In [None]:
from transformers import AutoModelForSeq2SeqLM, T5ForConditionalGeneration

# Load the fine-tuned sequence-to-sequence checkpoint used for generation.
# `AutoModelForSeq2SeqLM` was used here without ever being imported, so the
# cell raised a NameError; it is now imported above.
prediction_model = AutoModelForSeq2SeqLM.from_pretrained("/content/gdrive/MyDrive/final_thesis_experiments/49_90/checkpoint-5625/")

# model.to('cpu')

# Take test rows 2500-2599 as the generation batch.
# NOTE(review): `tokenizer` is the distilbert-base-uncased tokenizer loaded
# earlier, while `prediction_model` is a seq2seq checkpoint — confirm they share
# a vocabulary, or load the tokenizer that belongs to the checkpoint.
src_text = tokenized_dataset['test']['en'][2500:2600]
tokens = tokenizer(src_text, return_tensors="pt", padding=True)
print(src_text)
print(tokens)

# Generate one output sequence per source sentence (at most 256 tokens each).
translated = prediction_model.generate(**tokens, max_length=256)

In [None]:
print(type(translated))

# Decode only the rows that were fed to `generate` (test rows 2500-2599) so
# that outs_srcs[i] and outs_lbls[i] describe the same example. Previously
# every test row was decoded, which paired sources 0, 1, ... with predictions
# for rows 2500, 2501, ... in the display below.
outs_srcs = [
    tokenizer.decode(t, skip_special_tokens=True)
    for t in tokenized_dataset['test'][2500:2600]['input_ids']
]
outs_lbls = [tokenizer.decode(t, skip_special_tokens=True) for t in translated.tolist()]

outs_srcs[:2], outs_lbls[:2]

In [None]:
# Compare the first ten reference formulas with the first ten generated outputs.
print(tokenized_dataset['test']['ltl'][2500:2510])
print(outs_lbls[:10])

print(type(tokenized_dataset['test']['ltl']))
print(type(outs_lbls))

# The predictions in `outs_lbls` were generated from test rows 2500-2599, so the
# references must come from the same rows. They were previously taken from rows
# 0-99, which scored every prediction against an unrelated target.
# NOTE(review): `metric` was loaded as "accuracy"; the nested-list references
# used here look like the sacrebleu input format — confirm which metric is intended.
metric.compute(references=[[t] for t in tokenized_dataset['test']['ltl'][2500:2600]], predictions=outs_lbls)