In [2]:
import pandas as pd
import numpy as np

import os
cwd = os.getcwd()
kaggle = cwd == "/kaggle/working"

# pretrain = pd.read_parquet("pretrain.parquet")
# train = pd.read_parquet("train.parquet")
# test = pd.read_parquet("test.parquet")

pretrain = pd.read_parquet(("/kaggle/input/latsis-experiments/" if kaggle else "") + "pretrain.parquet")
train = pd.read_parquet(("/kaggle/input/latsis-experiments/" if kaggle else "") + "train.parquet")
test = pd.read_parquet(("/kaggle/input/latsis-experiments/" if kaggle else "") + "test.parquet")

#convert to string
train["text"] = train["text"].astype(str)
test["text"] = test["text"].astype(str)

#keep only the first 1000 rows
pretrain = pretrain[:1000]
train = train[:1000]
test = test[:1000]

In [3]:
import torch
from torch import nn
from transformers import TrainingArguments, Trainer, AutoTokenizer, XLMRobertaTokenizerFast, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=1)
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
config = model.config
tokenizer.model_max_length = config.max_position_embeddings
model.classifier.out_proj = nn.Sequential(
    nn.Linear(config.hidden_size, 1),
    nn.Sigmoid()
)


# tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels=1, ignore_mismatched_sizes=True)




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin c:\ProgramData\Anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda116_nocublaslt.dll
CUDA SETUP: CUDA runtime path found: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary c:\ProgramData\Anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda116_nocublaslt.dll...


  warn(msg)
  warn(msg)
  warn(msg)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# for param in model.parameters():
#     param.requires_grad = False
# for param in model.classifier.parameters():
#     param.requires_grad = True

# for param in model.roberta.encoder.layer[-1].parameters():
#     param.requires_grad = True
# for param in model.roberta.encoder.layer[-2].parameters():
#     param.requires_grad = True


from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaLayer

# Get the configuration of one existing layer to use as a template
template_config = model.config

# Manually create new layers using the template configuration
new_layers = [XLMRobertaLayer(template_config) for _ in range(2)]

# Initialize the new layers with random weights
for layer in new_layers:
    layer.apply(model._init_weights)

# Append new layers to the existing stack of layers
model.roberta.encoder.layer.extend(new_layers)

# Update the config to reflect the new number of layers
model.config.num_hidden_layers += 2

# Freeze the original layers
for param in model.roberta.encoder.layer[:-2].parameters():
    param.requires_grad = False

# Check the architecture (should now have 14 layers in the encoder)
# print(model)


In [5]:
from torch.utils.data import Dataset
import torch
import numpy as np

max_length = 128

def encode_texts(tokenizer, texts):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
    return torch.tensor(input_ids), torch.tensor(attention_masks)

pretrain_x, pretrain_attention_mask = encode_texts(tokenizer, pretrain['text'])
pretrain_y = torch.tensor(np.array(pretrain['label'].tolist()), dtype=torch.float32)

train_x, train_attention_mask = encode_texts(tokenizer, train['text'])
train_y = torch.tensor(np.array(train['label'].tolist()), dtype=torch.float32)

test_x, test_attention_mask = encode_texts(tokenizer, test['text'])
test_y = torch.tensor(np.array(test['label'].tolist()), dtype=torch.float32)



class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_mask, label):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.label = label

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'label': self.label[idx],
        }

pretrain_dataset = CustomDataset(pretrain_x, pretrain_attention_mask, pretrain_y)
train_dataset = CustomDataset(train_x, train_attention_mask, train_y)
val_dataset = CustomDataset(test_x, test_attention_mask, test_y)

print(train_x.shape, train_attention_mask.shape, train_y.shape, test_x.shape, test_attention_mask.shape, test_y.shape, pretrain_x.shape, pretrain_attention_mask.shape, pretrain_y.shape)


torch.Size([1000, 128]) torch.Size([1000, 128]) torch.Size([1000]) torch.Size([1000, 128]) torch.Size([1000, 128]) torch.Size([1000]) torch.Size([1000, 128]) torch.Size([1000, 128]) torch.Size([1000])


In [6]:
from datasets import load_metric

def compute_metrics(p):
    metric = load_metric("accuracy")
    metric_f1 = load_metric("f1")
    accuracy = metric.compute(predictions=p.predictions.argmax(-1), references=p.label_ids)
    f1 = metric_f1.compute(predictions=p.predictions.argmax(-1), references=p.label_ids, average='macro')
    return {"accuracy": accuracy["accuracy"], "f1": f1}

from transformers import AdamW, get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR

class EpochBasedLRScheduler(LambdaLR):
    def __init__(self, optimizer, lr_per_epoch, len_train_loader, last_epoch=-1):
        self.lr_per_epoch = lr_per_epoch
        self.len_train_loader = len_train_loader
        super(EpochBasedLRScheduler, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, current_step: int):
        current_epoch = current_step // self.len_train_loader
        if current_epoch < len(self.lr_per_epoch):
            return self.lr_per_epoch[current_epoch] / self.base_lrs[0]
        return 1.0

    
from transformers import TrainerCallback, TrainerControl, TrainingArguments

class PrintLearningRateCallback(TrainerCallback):
    def on_epoch_end(self, args: TrainingArguments, state: TrainerControl, control: TrainerControl, **kwargs):
        lr = state.log_history[-1]['learning_rate']
        print(f"Learning rate at end of epoch {state.epoch}: {lr}")



optimizer = AdamW(model.parameters(), lr=1e-5)



class EpochAverageLossCallback(TrainerCallback):
    def __init__(self):
        self.cumulative_loss = 0.0
        self.batch_count = 0

    def on_train_begin(self, args, state, control, **kwargs):
        # Reset at the beginning of training
        self.cumulative_loss = 0.0
        self.batch_count = 0

    def on_step_end(self, args, state, control, **kwargs):
        # Accumulate losses and increment batch count after each step
        self.cumulative_loss += state.log_history[-1]['loss'] if state.log_history else 0
        self.batch_count += 1

    def on_epoch_end(self, args, state, control, **kwargs):
        # Compute average loss and print it
        average_epoch_loss = self.cumulative_loss / self.batch_count
        print(f"Average training loss over epoch {state.epoch}: {average_epoch_loss:.4f}")

        # Reset for next epoch
        self.cumulative_loss = 0.0
        self.batch_count = 0


lr_per_epoch = [1e-5,1e-5,1e-5]

retrain = True
# retrain = False

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=len(lr_per_epoch),
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=0,
    weight_decay=0.01,
    logging_steps=len(train_dataset) // 4,
    #lr_scheduler_type='constant',
    #learning_rate=1e-9,
    report_to='none',
    evaluation_strategy='epoch',
    save_strategy='no',
)

len_train_loader = len(train_dataset) // training_args.per_device_train_batch_size



def train(model, scheduler, train_dataset):
    global optimizer, training_args, val_dataset
    model.train()
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, scheduler),
        # callbacks=[PrintLearningRateCallback(), EpochAverageLossCallback()]
    )
    trainer.train()
    model.eval()

if retrain:
    scheduler = EpochBasedLRScheduler(optimizer, lr_per_epoch, len_train_loader)
    train(model, scheduler, pretrain_dataset)
    scheduler = EpochBasedLRScheduler(optimizer, lr_per_epoch, len_train_loader)
    train(model, scheduler, train_dataset)
    torch.save(model, '/kaggle/working/model.pt' if kaggle else 'model.pt')
    
else:
    model = torch.load('/kaggle/working/model.pt' if kaggle else 'model.pt')
    model.eval()




  0%|          | 0/500 [00:00<?, ?it/s]

{'loss': 0.2218, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

  metric = load_metric("accuracy")


{'eval_loss': 0.23930753767490387, 'eval_accuracy': 0.666, 'eval_f1': {'f1': 0.3997599039615847}, 'eval_runtime': 9.8188, 'eval_samples_per_second': 101.845, 'eval_steps_per_second': 25.461, 'epoch': 1.0}
{'loss': 0.2204, 'learning_rate': 1e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.2321384847164154, 'eval_accuracy': 0.666, 'eval_f1': {'f1': 0.3997599039615847}, 'eval_runtime': 10.1669, 'eval_samples_per_second': 98.358, 'eval_steps_per_second': 24.589, 'epoch': 2.0}
{'train_runtime': 123.6125, 'train_samples_per_second': 16.18, 'train_steps_per_second': 4.045, 'train_loss': 0.22110443878173827, 'epoch': 2.0}
