In [None]:
from google.colab import files
# Open Colab's upload dialog to bring the fine-tuning data (legal deposition
# data) into the runtime. The next cell loads 'dataset.csv', so the uploaded
# file is presumably expected to have that name.
uploaded = files.upload()

In [None]:
from datasets import load_dataset

# Read the uploaded CSV; the split is then taken from the resulting "train" entry.
dataset = load_dataset('csv', data_files='dataset.csv')

# Hold out 20% of the rows for evaluation (80-20 split). The fixed seed makes
# the shuffle deterministic, so a Restart & Run All reproduces the same
# train/test partition and therefore comparable evaluation numbers.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
print(dataset)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the checkpoint that is fine-tuned later in
# this notebook, so the vocabulary matches the model.
checkpoint_name = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

def preprocess_function(examples):
    """Tokenize the "text1"/"text2" entries of ``examples`` as sentence pairs.

    Args:
        examples: Mapping with "text1" and "text2" entries. It is applied via
            ``dataset.map(..., batched=True)``, so each entry holds a batch of
            values rather than a single string.

    Returns:
        The tokenizer output for the pairs, truncated and padded to exactly
        128 tokens per pair.
    """
    first_texts = examples["text1"]
    second_texts = examples["text2"]
    encoded = tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Tokenize both splits in batches, drop the raw text columns that are no
# longer needed once encoded, and rename the target column "label" -> "labels".
tokenized_dataset = (
    dataset
    .map(preprocess_function, batched=True)
    .remove_columns(["text1", "text2"])
    .rename_column("label", "labels")
)

# Switch the output format in place so indexing yields torch tensors.
tokenized_dataset.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification

# Three output classes: contradiction, entailment, or neutral.
# NOTE(review): the integer ids in the dataset's label column presumably need
# to follow the checkpoint's own label order -- confirm against
# model.config.id2label before training.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large-mnli",
    num_labels=3,
)

In [None]:
from transformers import TrainingArguments

# Fine-tuning hyperparameters.
#
# `load_best_model_at_end=True` requires checkpoints to be saved on the same
# schedule as evaluation; with evaluation per epoch and the default step-based
# saving, TrainingArguments raises a ValueError. `save_strategy="epoch"` aligns
# the two so the best checkpoint can be identified and reloaded.
#
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",   # evaluate at the end of every epoch
    save_strategy="epoch",         # must match the evaluation schedule (see above)
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,            # keep at most two checkpoints on disk
    logging_dir="./logs",
    load_best_model_at_end=True
)

In [None]:
from transformers import Trainer

# Name the two splits produced by the 80-20 split for readability.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

# Wire model, hyperparameters, data and tokenizer together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)

# Run fine-tuning; left as the final expression so the cell shows the result.
trainer.train()

In [None]:
# Evaluate on the eval_dataset handed to the Trainer (the held-out "test"
# split) and print the returned metrics.
results = trainer.evaluate()
print(results)

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so the
# folder can be loaded later as a single self-contained checkpoint.
save_dir = "./fine_tuned_roberta"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from google.colab import files

!zip -r fine_tuned_roberta.zip ./fine_tuned_roberta
files.download("fine_tuned_roberta.zip")

In [None]:
# Smoke test: reload the checkpoint saved to ./fine_tuned_roberta and classify
# one hand-written sentence pair.
from transformers import pipeline

premise = "I was at home all day."
hypothesis = "I went to the store in the afternoon."

nli_pipeline = pipeline(
    "text-classification",
    model="./fine_tuned_roberta",
    tokenizer="./fine_tuned_roberta",
)

# The pair is passed as a dict with "text" and "text_pair" keys.
result = nli_pipeline({"text": premise, "text_pair": hypothesis})
print(result)