## Imports

In [1]:
import os
import nltk
from datasets import DatasetDict
from sklearn.feature_extraction import text
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from transformers import TrainerCallback
from transformers import DataCollatorWithPadding


nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jcarv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcarv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcarv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
if torch.cuda.is_available():
    print("CUDA is available! ")
else:
    print("CUDA is not available.")
    
print(torch.version.cuda)

CUDA is available! 
12.1


## Load Datasets

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your data
data = pd.read_json('../data/data.jsonl', lines=True)
test_data = pd.read_json('../data/test_final.jsonl', lines=True)
train_data = pd.read_json('../data/train_final.jsonl', lines=True)
validation_data = pd.read_json('../data/validation_final.jsonl', lines=True)

# Remove duplicates
test_data = test_data.drop_duplicates(subset=['text'])
train_data = train_data.drop_duplicates(subset=['text'])
validation_data = validation_data.drop_duplicates(subset=['text'])


## Pre-processing

In [79]:
my_stop_words = text.ENGLISH_STOP_WORDS
words_to_keep = frozenset(['no', 'couldnt', 'cry', 'not', 'cant', 'cannot', 'nor', 'except', 'nobody',
                           'off', 'but', 'serious', 'enough', 'nothing', 'alone', 'down', 'only', 'without'])
my_stop_words = my_stop_words - words_to_keep

def pre_process_data(dataset):
    # Remove stop words
    dataset['text'] = dataset['text'].apply(
        lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in my_stop_words])
    )
    return dataset


train_data = pre_process_data(train_data)
validation_data = pre_process_data(validation_data)
test_data = pre_process_data(test_data)


In [80]:
test_data

Unnamed: 0,text,label
0,feel awful s job position succeed just didn t ...,0
1,im alone feel awful,0
2,ive probably mentioned but really feel proud a...,1
3,feeling little low days,0
4,beleive sensitive peoples feelings tend compas...,2
...,...,...
5395,feel grumpy haven t yoga ed days,3
5396,read blog suburb direction mentioned casually ...,3
5397,not feel things realize violent physical suffe...,3
5398,feel petty silly giving shit but little things...,3


## Tokenization Using pre-trained model Tokenizer:
The model tokenizer requires the data to be in a specific format

In [86]:
# Initialize the model tokenizer and model

# model_name = "bert-base-uncased"
# model_name = "distilbert-base-uncased"
model_name = "roberta-base"
# model_name = "microsoft/deberta-v3-base"

# model_name = f"./roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

# Tokenize using BERT tokenizer
def tokenize_data(texts):
    return tokenizer(texts, padding='max_length', return_tensors='pt', truncation=True, max_length=128).to(device)

train_encodings = tokenize_data(train_data['text'].tolist())
val_encodings = tokenize_data(validation_data['text'].tolist())
test_encodings = tokenize_data(test_data['text'].tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
test_data['text'].tolist()

['feel awful s job position succeed just didn t happen',
 'im alone feel awful',
 'ive probably mentioned but really feel proud actually keeping new years resolution monthly weekly goals',
 'feeling little low days',
 'beleive sensitive peoples feelings tend compassionate',
 'frustrated christians feel constantly talk loving praying seen not case',
 'people feels like going gym only worthwhile hour',
 'feel especially pleased long time coming',
 'struggling awful feelings saying sweet things not deserving sisters friendship agreed car just starting drive away reached hand',
 'feel enraged but helpless time',
 'said feeling bit rebellious',
 'feel disillusioned claimed value truth fraud',
 'mean stupid trip making great album things going feel ecstatic',
 'woke feeling particularly vile tried ignore but got worse worse worse',
 'feel vile moth burrowing way brain seeking brain means control enslave just nasty bug things did chekov star trek wrath khan',
 'know just doing job doesnt actu

In [88]:
test_encodings

{'input_ids': tensor([[    0, 35702, 11522,  ...,     1,     1,     1],
        [    0,   757,  1937,  ...,     1,     1,     1],
        [    0,  2088,  1153,  ...,     1,     1,     1],
        ...,
        [    0,  3654,   619,  ...,     1,     1,     1],
        [    0, 35702, 25070,  ...,     1,     1,     1],
        [    0, 40451,  2157,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [14]:
model_not_finetuned_name = "roberta-base"
model_not_finetuned = AutoModelForSequenceClassification.from_pretrained(model_not_finetuned_name, num_labels=6).to(device).to(device)

## Create Dataset Objects

In [89]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset: TextDataset = TextDataset(train_encodings, train_data['label'].tolist())
val_dataset: TextDataset = TextDataset(val_encodings, validation_data['label'].tolist())
test_dataset: TextDataset = TextDataset(test_encodings, test_data['label'].tolist())


## Training

#### Function to compute the training metrics

In [90]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


#### Function that saves the training stats of the model to a file for further comparison

In [91]:
class TrainingStatsCallback(TrainerCallback):
    """A callback that logs and stores the progress of training."""
    def __init__(self):
        super().__init__()
        self.metrics_df = pd.DataFrame()
        self.output_dir = './training_stats'
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def on_epoch_end(self, args, state, control, **kwargs):
        # Assuming `metrics` is passed as part of kwargs and is a dictionary
        metrics = kwargs.get('metrics')
        new_record = pd.DataFrame([metrics])
        self.metrics_df = pd.concat([self.metrics_df, new_record], ignore_index=True)
        self.metrics_df.to_csv(os.path.join(self.output_dir, 'metrics.csv'), index=False)



#### Training hyppertunning

In [10]:
# import optuna
# from transformers import Trainer, TrainingArguments

# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

# def objective(trial):
#     # Hyperparameters to tune
#     learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
#     num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
#     per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    
#     # Define TrainingArguments with hyperparameters from the trial
#     training_args = TrainingArguments(
#         output_dir="./results",
#         learning_rate=learning_rate,
#         per_device_train_batch_size=per_device_train_batch_size,
#         num_train_epochs=num_train_epochs,
#         weight_decay=0.01,
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         load_best_model_at_end=True,
#         metric_for_best_model="accuracy",
#     )
    
#     trainer = Trainer(
#         model_init=model_init,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=val_dataset,
#         tokenizer=tokenizer,
#         compute_metrics=compute_metrics
#     )
    
#     trainer.train()
#     eval_result = trainer.evaluate()
    
#     # Optuna aims to minimize the objective, so if accuracy is the metric, return 1 - accuracy
#     return 1 - eval_result["eval_accuracy"]


# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=20)

# print("Best trial:")
# trial_ = study.best_trial

# print(f"  Value: {trial_.value}")
# print("  Params: ")
# for key, value in trial_.params.items():
#     print(f"    {key}: {value}")


#### Training the model

In [92]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [93]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

print("Training labels range: ", min(train_data['label']), "to", max(train_data['label']))
print("Validation labels range: ", min(validation_data['label']), "to", max(validation_data['label']))
print("Test labels range: ", min(test_data['label']), "to", max(test_data['label']))

print("Training data NaN values:", train_data.isnull().values.any())
print("Validation data NaN values:", validation_data.isnull().values.any())
print("Test data NaN values:", test_data.isnull().values.any())

 # Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[TrainingStatsCallback]
)

print(trainer)

Training labels range:  0 to 5
Validation labels range:  0 to 5
Test labels range:  0 to 5
Training data NaN values: False
Validation data NaN values: False
Test data NaN values: False
<transformers.trainer.Trainer object at 0x0000020122DEF070>


In [ ]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss


In [42]:
trainer.evaluate()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.20716449618339539,
 'eval_accuracy': 0.9303445720637273,
 'eval_f1': 0.9315931941911905,
 'eval_precision': 0.9380685377892941,
 'eval_recall': 0.9303445720637273,
 'eval_runtime': 111.9461,
 'eval_samples_per_second': 48.22,
 'eval_steps_per_second': 0.759}

## Training Evaluation

In [15]:
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

                                                     
  2%|▏         | 500/21304 [11:07<7:40:03,  1.33s/it]

{'loss': 1.7966, 'grad_norm': 1.9971829652786255, 'learning_rate': 1.9530604581299287e-05, 'epoch': 0.19}


                                                      
  5%|▍         | 1000/21304 [22:07<7:32:30,  1.34s/it]

{'loss': 1.7285, 'grad_norm': 5.741614818572998, 'learning_rate': 1.9061209162598577e-05, 'epoch': 0.38}


                                                      
  7%|▋         | 1500/21304 [33:16<7:17:21,  1.33s/it]

{'loss': 1.4968, 'grad_norm': 5.530463695526123, 'learning_rate': 1.8591813743897863e-05, 'epoch': 0.56}


  9%|▉         | 1909/21304 [42:33<7:43:33,  1.43s/it]

### Save the model

In [12]:
model_path = f'./my_trained_models/{model_name}'
trainer.save_model(model_path)

## Domain adaptation

In [316]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "bert-base-uncased"
model_m = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model_m.to(device)

initial_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)  # for comparisons
initial_model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model t

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [318]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [244]:
import torch

text = "I am feeling [MASK] right now, because of the weather."
inputs = tokenizer(text, return_tensors="pt").to(device)
token_logits = model_m(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> I am feeling weird right now, because of the weather.'
'>>> I am feeling strange right now, because of the weather.'
'>>> I am feeling better right now, because of the weather.'
'>>> I am feeling bad right now, because of the weather.'
'>>> I am feeling uncomfortable right now, because of the weather.'


In [319]:
from datasets import DatasetDict, Dataset

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_data),
    "validation": Dataset.from_pandas(validation_data),
    "test": Dataset.from_pandas(test_data)
})

In [320]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    # result = tokenizer(examples["text"], padding=True, truncation=True, return_tensors='pt').to(device)  
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/42607 [00:00<?, ? examples/s]

Map:   0%|          | 0/5398 [00:00<?, ? examples/s]

Map:   0%|          | 0/5396 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 42607
    })
    validation: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 5398
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 5396
    })
})

In [13]:
tokenizer.model_max_length

512

In [321]:
chunk_size = 32

In [322]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()
                             if k != '__index_level_0__'}
    
    concatenated_examples['__index_level_0__'] = examples['__index_level_0__']
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [323]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/42607 [00:00<?, ? examples/s]

Map:   0%|          | 0/5398 [00:00<?, ? examples/s]

Map:   0%|          | 0/5396 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1320
    })
    validation: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 167
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 167
    })
})

In [324]:
lm_datasets = lm_datasets.remove_columns(["__index_level_0__", "token_type_ids"])

In [325]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [326]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels
        
    res = default_data_collator(features)
    return res

In [329]:
# This was the method used, not the for loops that will appear below

from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
# logging_steps = 1
model_name = f"{model_checkpoint}-wwm"

training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-wmm",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    num_train_epochs=8,
)

In [None]:
# load local model

trainer = Trainer(
    model=model_m,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    test_dataset=lm_datasets["test"],
    # data_collator=data_collator,
    data_collator=whole_word_masking_data_collator,
    tokenizer=tokenizer,
)

In [265]:
trainer.train()

In [266]:
trainer.evaluate()

In [330]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # masked_inputs = whole_word_masking_data_collator(features)
    
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [332]:
lm_datasets = lm_datasets.remove_columns(["word_ids"])
eval_dataset = lm_datasets["validation"].map(
    insert_random_mask,
    batched=True,
    remove_columns=lm_datasets["validation"].column_names,
)

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

In [333]:
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

In [334]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 167
})

In [353]:
from transformers.data.data_collator import torch_default_data_collator
from torch.utils.data import DataLoader
from transformers import default_data_collator

collator = default_data_collator
# collator = whole_word_masking_data_collator

batch_size = 64
train_dataloader = DataLoader(
    lm_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [354]:
# lm_datasets["validation"].features
print(train_dataloader.dataset)
print(eval_dataloader.dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1320
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 167
})


In [349]:
from torch.optim import AdamW

optimizer = AdamW(model_m.parameters(), lr=4e-5)

In [355]:
from transformers import get_scheduler

num_train_epochs = 8
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [357]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))
output_dir = f"{model_name}-wwm"

# this was not the method used for training but the results were essentially the same?? But I had some doubts about the loss/perplexity here
for epoch in range(num_train_epochs):
    # Training
    model_m.train()
    training_losses = []
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_m(**batch)
        loss = outputs.loss
        
        training_losses.append(loss.repeat(batch_size)) 
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
    training_losses = torch.cat(training_losses)  # this seems useless and confusing, but wtv
    training_losses = training_losses[: len(train_dataloader.dataset)]
    try:
        training_perplexity = math.exp(torch.mean(training_losses))
    except OverflowError:
        training_perplexity = float("inf")
    print(f">>> Epoch {epoch + 1}: Training Perplexity: {training_perplexity}")
        

    # Evaluation
    model_m.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model_m(**batch)

        loss:torch.Tensor = outputs.loss
        losses.append(loss.repeat(batch_size))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch + 1}: Perplexity: {perplexity}")


  0%|          | 0/168 [00:00<?, ?it/s]

>>> Epoch 1: Training Perplexity: 1.0016015851285005
>>> Epoch 1: Perplexity: 28291.66003866716
>>> Epoch 2: Training Perplexity: 1.0017305385592177
>>> Epoch 2: Perplexity: 18920.612252451443
>>> Epoch 3: Training Perplexity: 1.0008708001809226
>>> Epoch 3: Perplexity: 30854.916717727923
>>> Epoch 4: Training Perplexity: 1.0004474036828919
>>> Epoch 4: Perplexity: 33737.87399734711
>>> Epoch 5: Training Perplexity: 1.0003026585428925
>>> Epoch 5: Perplexity: 38197.62142543441
>>> Epoch 6: Training Perplexity: 1.0003180343121076
>>> Epoch 6: Perplexity: 48887.474472099966
>>> Epoch 7: Training Perplexity: 1.0002184581205678
>>> Epoch 7: Perplexity: 50281.94303011594
>>> Epoch 8: Training Perplexity: 1.0002276090000621
>>> Epoch 8: Perplexity: 49376.646908227136


In [358]:
# test the model
# use the test dataset and the whole_word_masking_data_collator
test_dataset = lm_datasets["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=lm_datasets["test"].column_names,
)

test_dataset = test_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    })

test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

# test_dataloader = accelerator.prepare(test_dataloader)

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

In [296]:
test_dataloader.dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 167
})

In [359]:
import math
losses = []
for step, batch in enumerate(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model_m(**batch)

    loss = outputs.loss
    losses.append(loss.repeat(batch_size))

In [360]:
losses = torch.cat(losses)
losses = losses[: len(test_dataset)]
try:
    perplexity = math.exp(torch.mean(losses))
except OverflowError:
    perplexity = float("inf")


In [361]:
print(f"Test Perplexity: {perplexity}")

Test Perplexity: 54452.582913266815


In [362]:
# test on some input
text = "I am feeling [MASK] right now, because of the weather."
inputs = tokenizer(text, return_tensors="pt").to(device)
token_logits = model_m(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> I am feeling better right now, because of the weather.'
'>>> I am feeling bad right now, because of the weather.'
'>>> I am feeling good right now, because of the weather.'
'>>> I am feeling sick right now, because of the weather.'
'>>> I am feeling strange right now, because of the weather.'


In [363]:
# save model_m


In [12]:
test_data

Unnamed: 0,text,label
0,feel awful s job position succeed just didn t ...,0
1,im alone feel awful,0
2,ive probably mentioned but really feel proud a...,1
3,feeling little low days,0
4,beleive sensitive peoples feelings tend compas...,2
...,...,...
5395,feel grumpy haven t yoga ed days,3
5396,read blog suburb direction mentioned casually ...,3
5397,not feel things realize violent physical suffe...,3
5398,feel petty silly giving shit but little things...,3


## Model Evaluation

In [62]:
y2= []
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# ' '.join(x)
for p in test_data['text'][:5]:
    print(' '.join(p))
    ti = tokenizer(' '.join(p),padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():  # Deactivate gradients for the following code
        out = model(**ti)
        
    pred = torch.argmax(out.logits)
    print(out)
    print(pred)
    y2.append(pred)

f e e l   a w f u l   s   j o b   p o s i t i o n   s u c c e e d   j u s t   d i d n   t   h a p p e n
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.8039, -1.5189, -1.9315,  0.1075, -3.1939, -1.8422]],
       device='cuda:0'), hidden_states=None, attentions=None)
tensor(0, device='cuda:0')
i m   a l o n e   f e e l   a w f u l
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.9390, -1.8971, -1.9743, -0.2243, -2.3850, -1.9237]],
       device='cuda:0'), hidden_states=None, attentions=None)
tensor(0, device='cuda:0')
i v e   p r o b a b l y   m e n t i o n e d   b u t   r e a l l y   f e e l   p r o u d   a c t u a l l y   k e e p i n g   n e w   y e a r s   r e s o l u t i o n   m o n t h l y   w e e k l y   g o a l s
SequenceClassifierOutput(loss=None, logits=tensor([[-2.0124,  7.0101,  0.3619, -2.2290, -2.1821, -1.2951]],
       device='cuda:0'), hidden_states=None, attentions=None)
tensor(1, device='cuda:0')
f e e l i n g   l i t t l e   l o w   d a y s
SequenceClassif

In [45]:
preds = trainer.predict(test_dataset)

In [47]:
preds.predictions

array([[ 6.8038945 , -1.518913  , -1.931491  ,  0.10749761, -3.1938698 ,
        -1.8422035 ],
       [ 6.938984  , -1.8970717 , -1.9743444 , -0.22429684, -2.3849494 ,
        -1.9237057 ],
       [-2.0124316 ,  7.0101357 ,  0.36193562, -2.2289808 , -2.1821327 ,
        -1.2951416 ],
       ...,
       [-1.3564557 , -0.8732065 , -1.2695175 ,  7.1644497 , -1.0327799 ,
        -2.229077  ],
       [-1.2067674 , -1.1778868 , -0.94585097,  6.913796  , -1.2110066 ,
        -1.8400692 ],
       [-1.2924892 , -1.9809064 , -1.2988603 ,  7.1253195 , -0.12671858,
        -2.079677  ]], dtype=float32)

In [63]:
y_pred= []
for p in test_data['text']:
    ti = tokenizer(' '.join(p), return_tensors="pt").to(device)
    out = model(**ti)
    pred = torch.argmax(out.logits)
    y_pred.append(pred)

In [19]:
y_pred

[tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(4, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, device='cuda:0'),
 tensor(3, dev

In [20]:
y_test

0       0
1       0
2       1
3       0
4       2
       ..
5395    3
5396    3
5397    3
5398    3
5399    3
Name: label, Length: 5396, dtype: int64

In [65]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# to tensor
y_test = test_data['label'].tolist()
# plain list of numbers
# y_pred_np = [int(i) for i in y_pred]
y_pred_np = [int(i) for i in preds.predictions.argmax(axis=1)] 
# y_test.append(2)
# y_pred_np.append(2)
print(confusion_matrix(y_test, y_pred_np))
print('Accuracy: ', accuracy_score(y_test, y_pred_np))
print('Precision: ', precision_score(y_test, y_pred_np, average='macro'))
print('Recall: ', recall_score(y_test, y_pred_np, average='macro'))
print('F1: ', f1_score(y_test, y_pred_np, average='macro'))

[[1466   20    4   42   32    2]
 [  47 1621  119   30    8   10]
 [   4    0  424    1    0    2]
 [   6    3    1  722   21    2]
 [   1    2    1   12  564   13]
 [   0    0    0    1    0  215]]
Accuracy:  0.9288361749444033
Precision:  0.8993631860303525
Recall:  0.9510063948994584
F1:  0.9217117252272052


In [31]:
max(y_pred_np)

5

In [29]:
y_test

[0,
 0,
 1,
 0,
 2,
 2,
 1,
 1,
 1,
 3,
 3,
 0,
 1,
 3,
 3,
 1,
 0,
 1,
 2,
 0,
 0,
 4,
 0,
 2,
 3,
 1,
 1,
 0,
 0,
 1,
 5,
 4,
 1,
 1,
 2,
 0,
 1,
 4,
 1,
 4,
 0,
 1,
 0,
 1,
 3,
 1,
 1,
 1,
 3,
 1,
 2,
 2,
 0,
 0,
 0,
 1,
 1,
 1,
 5,
 0,
 5,
 0,
 1,
 5,
 0,
 4,
 1,
 1,
 3,
 0,
 3,
 2,
 1,
 4,
 1,
 4,
 1,
 0,
 0,
 1,
 4,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 3,
 4,
 1,
 3,
 0,
 0,
 1,
 0,
 1,
 0,
 4,
 0,
 1,
 0,
 1,
 0,
 3,
 4,
 1,
 4,
 4,
 1,
 1,
 1,
 1,
 0,
 5,
 1,
 2,
 3,
 0,
 3,
 1,
 0,
 0,
 1,
 0,
 2,
 5,
 5,
 1,
 1,
 1,
 1,
 0,
 2,
 4,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 4,
 3,
 0,
 1,
 1,
 4,
 1,
 1,
 3,
 1,
 1,
 4,
 1,
 2,
 1,
 0,
 4,
 1,
 1,
 4,
 0,
 1,
 1,
 1,
 3,
 1,
 0,
 3,
 3,
 1,
 4,
 2,
 1,
 1,
 0,
 1,
 1,
 4,
 1,
 4,
 1,
 1,
 0,
 4,
 1,
 1,
 1,
 1,
 0,
 4,
 1,
 0,
 1,
 1,
 0,
 4,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 2,
 3,
 1,
 0,
 0,
 4,
 0,
 0,
 1,
 0,
 4,
 1,
 0,
 1,
 1,
 0,
 1,
 3,
 0,
 1,
 0,
 2,
 1,
 4,
 2,
 0,
 0,
 0,
 0,
 2,
 2,
 3,
 3,
 0,
 4,
 1,
 4,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 0,


In [32]:
y_pred_np

[3,
 3,
 1,
 1,
 3,
 1,
 1,
 3,
 1,
 3,
 3,
 1,
 3,
 1,
 1,
 1,
 4,
 1,
 3,
 1,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 1,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 1,
 4,
 3,
 3,
 0,
 1,
 3,
 0,
 0,
 3,
 3,
 1,
 3,
 1,
 3,
 3,
 1,
 0,
 3,
 3,
 3,
 1,
 3,
 1,
 3,
 3,
 0,
 3,
 1,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 3,
 0,
 1,
 3,
 3,
 0,
 3,
 3,
 3,
 0,
 3,
 3,
 1,
 3,
 0,
 3,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 3,
 1,
 0,
 3,
 3,
 1,
 3,
 3,
 1,
 1,
 1,
 1,
 3,
 0,
 1,
 3,
 1,
 1,
 3,
 3,
 1,
 0,
 1,
 3,
 1,
 1,
 1,
 3,
 3,
 0,
 0,
 0,
 0,
 1,
 3,
 1,
 1,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 0,
 1,
 3,
 0,
 0,
 1,
 3,
 3,
 3,
 1,
 3,
 1,
 3,
 1,
 1,
 1,
 3,
 1,
 0,
 4,
 0,
 0,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 3,
 0,
 3,
 1,
 3,
 3,
 1,
 3,
 1,
 3,
 0,
 1,
 1,
 3,
 1,
 1,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 4,
 3,
 1,
 1,
 3,
 4,
 1,
 1,
 3,
 1,
 3,
 1,
 0,
 3,
 1,
 1,
 0,
 3,
 3,
 1,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,


In [33]:
set(y_test) - set(y_pred_np)

{2}

In [74]:
# apply the data collator to the first 3 examples
d = data_collator(test_dataset[:3])


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [75]:
d

{'input_ids': tensor([[   0,  506,  364,  364,  784, 1437, 1437,   10,  885,  856, 1717,  784,
         1437, 1437,  579, 1437, 1437, 1236, 1021,  741, 1437, 1437,  181, 1021,
          579,  939,  326,  939, 1021,  295, 1437, 1437,  579, 1717,  740,  740,
          364,  364,  385, 1437, 1437, 1236, 1717,  579,  326, 1437, 1437,  385,
          939,  385,  295, 1437, 1437,  326, 1437, 1437, 1368,   10,  181,  181,
          364,  295,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [   0,  118,  475, 1437, 1437,   10,  784, 1021,  295,  364, 1437, 1437,
          856,  364,  364,  784, 1437