In [None]:
import datetime
import os
import statistics
import time
from argparse import ArgumentParser
import datasets
import torch
import random
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
# === Set your hyperparameters here ===
# Every value in this cell is copied onto the `Args` class further down; edit
# the values here rather than in the cells that consume them.

# Choose the dataset to use from ["en", "de"]
lang = "en"
assert lang in ["en", "de"]

# Name of the pre-trained checkpoint to fine-tune; any HuggingFace model
# (https://huggingface.co/models) can be given here.
model = "bert-base-cased"

# Directory where to save different training runs
output_dir = "./logs/runs/"

# Directory where data is stored (must already exist, see the assert below)
data_dir = "../../data/"
assert os.path.exists(data_dir)

# Cut or pad sequences to this length (in tokens)
max_len = 64
assert isinstance(max_len, int)

# Lowercase text. This should match the model used: set it to True for an
# uncased checkpoint such as bert-base-uncased and keep it False for a cased
# checkpoint such as bert-base-cased.
lowercase = False
assert isinstance(lowercase, bool)

# Other training-specific hyperparameters
# NOTE(review): gradient_accumulation_steps, learning_rate, adam_epsilon,
# max_grad_norm and max_steps are copied onto `Args` but are not read by
# `train()` in this notebook, so changing them appears to have no effect.
# `random_weights` and `early_stop` are forwarded to the trainer class, which
# stores them without using them.
gradient_accumulation_steps = 1
learning_rate = 5e-5
weight_decay = 0.0
adam_epsilon = 1e-8
max_grad_norm = 1.0
random_weights = False
batch_size = 5
epochs = 5
max_steps = -1
warmup_steps = 0
early_stop = False
eval_steps = 100
num_labels = 2
# Strategy for saving model. Possible values: ["no", "epoch", "steps"]. See https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments.save_strategy
save_strategy = "no"
# Only takes effect when save_strategy is not "no": TextClassifierTrainer
# forces this to False whenever save_strategy == "no".
load_best_model_at_end = True

# Set seed for reproducibility
seed = 42

# ================ End ================

In [None]:
def make_output_dir(output_dir):
    """Create and return a timestamped run directory below ``output_dir``.

    The sub-directory is named after the current local time formatted as
    ``YYYYmmdd-HHMMSS``. Missing parent directories are created as well.

    Args:
        output_dir: Base directory under which the run directory is created.

    Returns:
        The path of the (now existing) run directory.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_dir = os.path.join(output_dir, timestamp)
    # exist_ok avoids the check-then-create race of a separate exists() test
    # and makes a second call within the same second a harmless no-op.
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

In [None]:
class Dataset:
    """Tokenizes the "train" and "test" splits of a dataset mapping.

    ``dataset`` is indexed with the keys ``"train"`` and ``"test"``; each split
    must provide ``map``, ``set_format`` and ``column_names`` (as Hugging Face
    ``datasets.Dataset`` objects do) and contain a ``"text"`` and a ``"label"``
    column.

    Args:
        dataset: Mapping from split name to split.
        lowercase: Stored on the instance; not applied by any method here.
        batch_size: Stored on the instance; not used by any method here.
        max_len: Length every sequence is truncated or padded to.
    """

    def __init__(self, dataset, lowercase=False, batch_size=16, max_len=64):
        self.dataset = dataset
        self.lowercase = lowercase
        self.batch_size = batch_size
        self.max_len = max_len

        # Populated by format_data(); defined here so the attributes always exist.
        self.tokenizer = None
        self.train_dataset = None
        self.test_dataset = None

    def _encode(self, example):
        """Tokenize the ``"text"`` entry of a batch, padded/truncated to ``max_len``."""
        if self.tokenizer is None:
            raise RuntimeError(
                "No tokenizer set; call format_data(tokenizer) before encoding."
            )
        return self.tokenizer(
            example["text"],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
        )

    def format(self, dataset):
        """Tokenize one split and expose the model inputs as torch tensors.

        Returns:
            The mapped split, formatted to yield ``input_ids``,
            ``attention_mask``, ``label`` and, when the tokenizer produced
            it, ``token_type_ids``.
        """
        dataset = dataset.map(self._encode, batched=True)

        columns = ["input_ids", "attention_mask", "label"]
        # Not every tokenizer emits token_type_ids; requesting a column that
        # does not exist makes set_format fail, so include it only if present.
        if "token_type_ids" in dataset.column_names:
            columns.insert(1, "token_type_ids")

        dataset.set_format(type="torch", columns=columns)
        return dataset

    def format_data(self, tokenizer, batch_size=None):
        """Tokenize the train and test splits with ``tokenizer``.

        Args:
            tokenizer: Callable used by ``_encode`` to tokenize text batches.
            batch_size: If truthy, replaces the stored ``batch_size``.

        The results are stored in ``self.train_dataset`` and
        ``self.test_dataset``; no validation split is formatted.
        """
        print("Formatting data...")
        self.tokenizer = tokenizer
        if batch_size:
            self.batch_size = batch_size

        self.train_dataset = self.format(self.dataset["train"])
        self.test_dataset = self.format(self.dataset["test"])
        print("Done formatting.")

In [None]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)


def compute_metrics(p, average="macro"):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Iterable unpacking into ``(scores, labels)``; the predicted class is
            the argmax of ``scores`` along axis 1.
        average: Averaging mode passed to the precision/recall/F1 scorers.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1-score"``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(y_true=labels, y_pred=predicted)}
    averaged_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1-score", f1_score),
    )
    for metric_name, scorer in averaged_scorers:
        metrics[metric_name] = scorer(y_true=labels, y_pred=predicted, average=average)
    return metrics


class TextClassifier:
    """Loads a pre-trained tokenizer and sequence-classification model.

    Args:
        model_name: Checkpoint name or path given to the ``Auto*`` loaders.
        lowercase: Value assigned to the tokenizer's ``do_lower_case`` attribute.
        max_len: Value assigned to the tokenizer's ``model_max_length``.
        num_labels: Number of output classes written into the model config.
        output_dir: Default directory used by ``save_model``.
        seed: If not ``None``, ``set_seed`` is called with it before the model
            is loaded.
    """

    # File name used for the weights written by save_model().
    WEIGHTS_NAME = "pytorch_model.bin"

    def __init__(
        self,
        model_name="bert-base-uncased",
        lowercase=False,
        max_len=64,
        num_labels=2,
        output_dir="",
        seed=None,
    ):
        self.output_dir = output_dir

        # Model parameters
        self.model_name = model_name
        self.do_lower_case = lowercase
        self.max_len = max_len
        self.num_labels = num_labels

        # Set seed for reproducibility. Compare against None so that a seed
        # of 0 is honoured instead of being treated as "no seed".
        if seed is not None:
            set_seed(seed)

        self.model_init()

    def model_init(self):
        """Load the tokenizer, the model config and the model for ``model_name``."""
        print("Loading pre-trained tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)

        # NOTE(review): do_lower_case is assigned after construction; whether a
        # fast tokenizer picks this up at call time should be confirmed.
        self.tokenizer.do_lower_case = self.do_lower_case
        self.tokenizer.model_max_length = self.max_len
        print("Done.")

        self.model_config = AutoConfig.from_pretrained(self.model_name)
        self.model_config.num_labels = self.num_labels

        print("Loading pre-trained model...")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, config=self.model_config
        )
        print(f"Loaded model from path: {self.model_name}")

    def save_model(self, save_path=None):
        """Write the model's state dict to ``<dir>/pytorch_model.bin``.

        Args:
            save_path: Target directory; falls back to ``self.output_dir`` and
                then to the current directory when empty.

        Returns:
            Path of the weights file that was written.
        """
        save_dir = save_path if save_path else self.output_dir
        # An empty output_dir means "current directory"; fspath also accepts
        # pathlib.Path objects.
        save_dir = os.fspath(save_dir) if save_dir else "."
        os.makedirs(save_dir, exist_ok=True)

        # Save the state dict (not the pickled module) into a file inside the
        # directory, which is the format load_model() reads back.
        weights_file = os.path.join(save_dir, self.WEIGHTS_NAME)
        torch.save(self.model.state_dict(), weights_file)
        print(f"Saved model to path: {save_dir}")
        return weights_file

    def load_model(self, path):
        """Load a state dict from ``path`` into the current model.

        Args:
            path: A weights file, or a directory containing
                ``pytorch_model.bin`` (as written by ``save_model``).

        Raises:
            FileNotFoundError: If the resolved path does not exist.
        """
        if os.path.isdir(path):
            path = os.path.join(path, self.WEIGHTS_NAME)

        if not os.path.exists(path):
            print(f"Model path does not exist: {path}")
            raise FileNotFoundError(f"The specified file path ({path}) does not exist!")

        print("Loading local model state dict...")
        # SECURITY: torch.load unpickles data; only load files from trusted sources.
        # map_location="cpu" lets checkpoints saved on a GPU load on CPU-only
        # machines; load_state_dict copies the values into the existing parameters.
        state_dict = torch.load(path, map_location="cpu")
        self.model.load_state_dict(state_dict)
        print(f"Loaded model from path: {path}")


class TextClassifierTrainer(TextClassifier):
    """Fine-tunes the model loaded by ``TextClassifier`` with the HF ``Trainer``.

    Args:
        output_dir: Directory for trainer outputs; logs go to ``<output_dir>/logs``.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size (evaluation uses 4x this).
        warmup_steps: Warm-up steps passed to ``TrainingArguments``.
        weight_decay: Weight decay passed to ``TrainingArguments``.
        random_weights: Stored on the instance; not read by ``train``.
        load_best_model_at_end: Forced to ``False`` when ``save_strategy`` is "no".
        early_stop: Stored on the instance; not read by ``train``.
        eval_steps: Interval (in steps) for both logging and evaluation.
        seed: Seed forwarded to the parent class and to ``TrainingArguments``.
        save_strategy: Checkpoint saving strategy passed to ``TrainingArguments``.
        **kwargs: Forwarded to ``TextClassifier.__init__`` (e.g. ``model_name``,
            ``lowercase``, ``max_len``, ``num_labels``).
    """

    def __init__(
        self,
        output_dir="",
        epochs=5,
        batch_size=16,
        warmup_steps=500,
        weight_decay=0.9,
        random_weights=False,
        load_best_model_at_end=False,
        early_stop=False,
        eval_steps=500,
        seed=42,
        save_strategy="no",
        **kwargs,
    ):
        # Forward output_dir and seed: the parent seeds the RNGs before it
        # loads the model, which is what makes the run reproducible.
        super().__init__(output_dir=output_dir, seed=seed, **kwargs)
        self.output_dir = output_dir
        self.logging_dir = os.path.join(output_dir, "logs")

        # Model parameters
        self.epochs = epochs
        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        # Loading the best checkpoint is impossible when nothing is saved.
        self.load_best_model_at_end = load_best_model_at_end if save_strategy != "no" else False
        self.random_weights = random_weights
        self.early_stop = early_stop
        self.eval_steps = eval_steps
        self.save_strategy = save_strategy

        self.seed = seed

    def train(self, train_dataset, eval_dataset):
        """Train on ``train_dataset``, evaluate on ``eval_dataset``.

        Returns:
            The metrics dict returned by ``Trainer.evaluate()``.

        Side effects:
            When ``save_strategy`` is not "no", the model and trainer state are
            saved to ``output_dir`` and ``self.trained_model_path`` is set.
        """
        ### Training
        print("Initializing trainer...")

        # Only override the TrainingArguments seed when one was configured.
        seed_args = {}
        if self.seed is not None:
            seed_args["seed"] = self.seed

        # NOTE(review): evaluation is always step-based here; check that the
        # chosen save_strategy is compatible when load_best_model_at_end is on.
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size * 4,
            warmup_steps=self.warmup_steps,
            weight_decay=self.weight_decay,
            logging_dir=self.logging_dir,
            logging_strategy="steps",
            logging_steps=self.eval_steps,
            evaluation_strategy="steps",
            eval_steps=self.eval_steps,
            load_best_model_at_end=self.load_best_model_at_end,
            save_strategy=self.save_strategy,
            **seed_args,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )
        print("Trainer initialized.")
        print("Training...")
        trainer.train()
        print("Done training.")

        ### Validation
        print("Evaluating...")
        res = trainer.evaluate()
        print("Evaluation results:")
        print(
            f'Eval loss: \t{res["eval_loss"]}, Eval Acc: \t{res["eval_accuracy"]}, Eval P: \t{res["eval_precision"]}, Eval R: \t{res["eval_recall"]}, Eval F1: \t{res["eval_f1-score"]}'
        )
        print("Done evaluating.")

        if self.save_strategy != "no":
            trainer.save_model()
            trainer.save_state()
            # The weights file name depends on the transformers version/settings
            # (presumably "model.safetensors" on newer releases), so probe both.
            candidates = [
                os.path.join(self.output_dir, file_name)
                for file_name in ("pytorch_model.bin", "model.safetensors")
            ]
            self.trained_model_path = next(
                (path for path in candidates if os.path.isfile(path)), candidates[0]
            )
            if os.path.isfile(self.trained_model_path):
                print(f"Saved model to path: {self.output_dir}")
            else:
                # Warn instead of raising so the evaluation results are not lost.
                print(f"Warning: no model weights file found in: {self.output_dir}")

        return res

In [None]:
def train(args):
    """Run 10-fold cross-validation and print mean/std of the eval metrics.

    The TSV at ``args.train_path`` is read, its ``is_variable`` column is
    renamed to ``label``, and one fresh ``TextClassifierTrainer`` is trained
    and evaluated per fold.

    Args:
        args: Object providing ``train_path``, ``model``, ``lowercase``,
            ``epochs``, ``batch_size``, ``output_dir``, ``warmup_steps``,
            ``weight_decay``, ``load_best_model_at_end``, ``random_weights``,
            ``early_stop``, ``eval_steps``, ``save_strategy`` and ``seed``;
            ``max_len`` and ``num_labels`` are optional (defaults 64 and 2).

    Raises:
        FileNotFoundError: If ``args.train_path`` does not exist.
    """
    if not os.path.exists(args.train_path):
        raise FileNotFoundError(f"Failed to load: {args.train_path}")

    train_df = pd.read_csv(args.train_path, sep="\t").rename(
        columns={"is_variable": "label"}
    )
    X_idx = train_df.index.to_numpy()

    # NOTE(review): folds are taken in file order (no shuffling); confirm the
    # TSV is not sorted by label, otherwise folds may be unbalanced.
    kf = KFold(n_splits=10)
    n_folds = kf.get_n_splits(X_idx)

    # Forward the configured values instead of silently using class defaults.
    max_len = getattr(args, "max_len", 64)
    num_labels = getattr(args, "num_labels", 2)

    results = {}

    # Train model with cross-validation
    print("Training models with cross-validation...")
    for train_index, test_index in tqdm(kf.split(X_idx), total=n_folds):
        train_dataset = datasets.Dataset.from_pandas(train_df.iloc[train_index])
        test_dataset = datasets.Dataset.from_pandas(train_df.iloc[test_index])

        # Each fold gets its own timestamped directory only when models are saved.
        run_output_dir = args.output_dir
        if args.save_strategy != "no":
            run_output_dir = make_output_dir(args.output_dir)

        trainer = TextClassifierTrainer(
            model_name=args.model,
            lowercase=args.lowercase,
            max_len=max_len,
            num_labels=num_labels,
            epochs=args.epochs,
            batch_size=args.batch_size,
            output_dir=run_output_dir,
            warmup_steps=args.warmup_steps,
            weight_decay=args.weight_decay,
            load_best_model_at_end=args.load_best_model_at_end,
            random_weights=args.random_weights,
            early_stop=args.early_stop,
            eval_steps=args.eval_steps,
            save_strategy=args.save_strategy,
            seed=args.seed,
        )

        dataset = Dataset(
            {"train": train_dataset, "test": test_dataset},
            lowercase=args.lowercase,
            batch_size=args.batch_size,
            max_len=max_len,
        )
        dataset.format_data(trainer.tokenizer)

        result = trainer.train(dataset.train_dataset, dataset.test_dataset)

        # Collect every reported value per key, one entry per fold.
        for key, value in result.items():
            results.setdefault(key, []).append(value)

    # Compute mean and standard deviation
    print("***** Cross-Validation Results *****")
    metric_names = ("accuracy", "precision", "recall", "f1-score")
    for key, values in results.items():
        # Skip bookkeeping entries (loss, runtime, ...) that are not metrics.
        if not any(metric_name in key for metric_name in metric_names):
            continue
        mean, std, pstd = (
            statistics.mean(values),
            statistics.stdev(values),
            statistics.pstdev(values),
        )
        print(
            key + ":\n",
            "Mean:",
            round(mean, 4),
            "\tStd.:",
            round(std, 4),
            "\tPStd:",
            round(pstd, 4),
        )

In [None]:
# Bundle the hyperparameters from the configuration cell into one object

class Args:
    """Attribute container mirroring the notebook-level hyperparameters.

    Each class attribute is bound, at class-definition time, to the value of
    the global variable of the same name (``data_lang`` is bound to ``lang``).
    Re-run this cell after editing the configuration cell, otherwise the old
    values are kept.
    """
    model = model
    output_dir = output_dir
    data_dir = data_dir
    data_lang = lang
    max_len = max_len
    lowercase = lowercase
    gradient_accumulation_steps = gradient_accumulation_steps
    learning_rate = learning_rate
    weight_decay = weight_decay
    random_weights = random_weights
    adam_epsilon = adam_epsilon
    max_grad_norm = max_grad_norm
    batch_size = batch_size
    epochs = epochs
    max_steps = max_steps
    warmup_steps = warmup_steps
    early_stop = early_stop
    eval_steps = eval_steps
    num_labels = num_labels
    seed = seed
    save_strategy = save_strategy
    load_best_model_at_end = load_best_model_at_end

# Instance handed to train(); attributes such as train_path are added later.
args = Args()

In [None]:
# Display the configured data directory as the cell output (sanity check only)
data_dir

In [None]:
# Training file for the selected language: <data_dir>/trial/train/<lang>.tsv
args.train_path = os.path.join(data_dir, "trial", "train", lang+".tsv")

In [None]:
# Run training: cross-validates the model on args.train_path and prints the
# per-metric mean and standard deviation across folds.

train(args)