## Imports

In [2]:
import os
import nltk
from datasets import DatasetDict
from sklearn.feature_extraction import text
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from transformers import TrainerCallback
from transformers import DataCollatorWithPadding


nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jcarv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcarv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcarv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# print torch version
print(torch.__version__)


2.3.0+cu121


In [10]:
! pip uninstall simcse

^C


In [13]:
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121


In [4]:
if torch.cuda.is_available():
    print("CUDA is available! ")
else:
    print("CUDA is not available.")
    
print(torch.version.cuda)

CUDA is available! 
12.1


## Load Datasets

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your data
data = pd.read_json('../data/data.jsonl', lines=True)
test_data = pd.read_json('../data/test_final.jsonl', lines=True)
train_data = pd.read_json('../data/train_final.jsonl', lines=True)
validation_data = pd.read_json('../data/validation_final.jsonl', lines=True)

# Remove duplicates
test_data = test_data.drop_duplicates(subset=['text'])
train_data = train_data.drop_duplicates(subset=['text'])
validation_data = validation_data.drop_duplicates(subset=['text'])

#test_data.head()
#train_data["text"].size

## Pre-processing

In [6]:
my_stop_words = text.ENGLISH_STOP_WORDS
words_to_keep = frozenset(['no', 'couldnt', 'cry', 'not', 'cant', 'cannot', 'nor', 'except', 'nobody',
                           'off', 'but', 'serious', 'enough', 'nothing', 'alone', 'down', 'only', 'without'])
my_stop_words = my_stop_words - words_to_keep

def pre_process_data(dataset):
    # Tokenize
    #dataset['text'] = dataset['text'].apply(word_tokenize)
    # Remove stop words
    dataset['text'] = dataset['text'].apply(
        lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in my_stop_words])
    )
    return dataset


train_data = pre_process_data(train_data)
validation_data = pre_process_data(validation_data)
test_data = pre_process_data(test_data)


## Tokenization Using pre-trained model Tokenizer:
The model tokenizer requires the data to be in a specific format

In [8]:
# Initialize the model tokenizer and model

model_name = "bert-base-uncased"
# model_name = "distilbert-base-uncased"
# model_name = "roberta-base"
# model_name = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

# Tokenize using BERT tokenizer
def tokenize_data(texts):
    # return tokenizer(texts, padding='max_length', truncation=True, max_length=128)
    return tokenizer(texts, padding='max_length', return_tensors='pt', truncation=True, max_length=128).to(device)

train_encodings = tokenize_data(train_data['text'].apply(lambda x: ' '.join(x)).tolist())
val_encodings = tokenize_data(validation_data['text'].apply(lambda x: ' '.join(x)).tolist())
test_encodings = tokenize_data(test_data['text'].apply(lambda x: ' '.join(x)).tolist())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create Dataset Objects

In [9]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset: TextDataset = TextDataset(train_encodings, train_data['label'].tolist())
val_dataset: TextDataset = TextDataset(val_encodings, validation_data['label'].tolist())
test_dataset: TextDataset = TextDataset(test_encodings, test_data['label'].tolist())


## Training

#### Function to compute the training metrics

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


#### Function that saves the training stats of the model to a file for further comparison

In [11]:
class TrainingStatsCallback(TrainerCallback):
    """A callback that logs and stores the progress of training."""
    def __init__(self):
        super().__init__()
        self.metrics_df = pd.DataFrame()
        self.output_dir = './training_stats'
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def on_epoch_end(self, args, state, control, **kwargs):
        # Assuming `metrics` is passed as part of kwargs and is a dictionary
        metrics = kwargs.get('metrics')
        new_record = pd.DataFrame([metrics])
        self.metrics_df = pd.concat([self.metrics_df, new_record], ignore_index=True)
        self.metrics_df.to_csv(os.path.join(self.output_dir, 'metrics.csv'), index=False)



#### Training hyppertunning

In [10]:
# import optuna
# from transformers import Trainer, TrainingArguments

# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

# def objective(trial):
#     # Hyperparameters to tune
#     learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
#     num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
#     per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    
#     # Define TrainingArguments with hyperparameters from the trial
#     training_args = TrainingArguments(
#         output_dir="./results",
#         learning_rate=learning_rate,
#         per_device_train_batch_size=per_device_train_batch_size,
#         num_train_epochs=num_train_epochs,
#         weight_decay=0.01,
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         load_best_model_at_end=True,
#         metric_for_best_model="accuracy",
#     )
    
#     trainer = Trainer(
#         model_init=model_init,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=val_dataset,
#         tokenizer=tokenizer,
#         compute_metrics=compute_metrics
#     )
    
#     trainer.train()
#     eval_result = trainer.evaluate()
    
#     # Optuna aims to minimize the objective, so if accuracy is the metric, return 1 - accuracy
#     return 1 - eval_result["eval_accuracy"]


# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=20)

# print("Best trial:")
# trial_ = study.best_trial

# print(f"  Value: {trial_.value}")
# print("  Params: ")
# for key, value in trial_.params.items():
#     print(f"    {key}: {value}")


#### Training the model

In [14]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

print("Training labels range: ", min(train_data['label']), "to", max(train_data['label']))
print("Validation labels range: ", min(validation_data['label']), "to", max(validation_data['label']))
print("Test labels range: ", min(test_data['label']), "to", max(test_data['label']))

print("Training data NaN values:", train_data.isnull().values.any())
print("Validation data NaN values:", validation_data.isnull().values.any())
print("Test data NaN values:", test_data.isnull().values.any())

 # Initialize Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[TrainingStatsCallback()]
)

print(trainer)

Training labels range:  0 to 5
Validation labels range:  0 to 5
Test labels range:  0 to 5
Training data NaN values: False
Validation data NaN values: False
Test data NaN values: False
<transformers.trainer.Trainer object at 0x00000209D1B430D0>


## Training Evaluation

In [15]:
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

                                                     
  2%|▏         | 500/21304 [11:07<7:40:03,  1.33s/it]

{'loss': 1.7966, 'grad_norm': 1.9971829652786255, 'learning_rate': 1.9530604581299287e-05, 'epoch': 0.19}


                                                      
  5%|▍         | 1000/21304 [22:07<7:32:30,  1.34s/it]

{'loss': 1.7285, 'grad_norm': 5.741614818572998, 'learning_rate': 1.9061209162598577e-05, 'epoch': 0.38}


                                                      
  7%|▋         | 1500/21304 [33:16<7:17:21,  1.33s/it]

{'loss': 1.4968, 'grad_norm': 5.530463695526123, 'learning_rate': 1.8591813743897863e-05, 'epoch': 0.56}


  9%|▉         | 1909/21304 [42:33<7:43:33,  1.43s/it]

### Save the model

In [12]:
# model_path = f'./my_trained_models/{model_name}-full'
# trainer.save_model(model_path)

roberta_path = f"./roberta-base"
# load the model
model = AutoModelForSequenceClassification.from_pretrained(roberta_path)

2

In [35]:
outputs = model(**test_encodings)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2121793536 bytes.

## Domain adaptation

In [12]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "bert-base-uncased"
model_m = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model_m.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
num_parameters = model_m.num_parameters() / 1_000_000
print(f"'>>> number of parameters: {round(num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> number of parameters: 110M'
'>>> BERT number of parameters: 110M'


In [14]:
# text = "This is a great [MASK]."

In [ ]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [18]:
# import torch
# 
# inputs = tokenizer(text, return_tensors="pt").to(device)
# token_logits = model_m(**inputs).logits
# # Find the location of [MASK] and extract its logits
# mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# mask_token_logits = token_logits[0, mask_token_index, :]
# # Pick the [MASK] candidates with the highest logits
# top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# 
# for token in top_5_tokens:
#     print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great idea.'
'>>> This is a great day.'
'>>> This is a great place.'
'>>> This is a great time.'
'>>> This is a great thing.'


In [32]:
from datasets import DatasetDict, Dataset

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_data),
    "validation": Dataset.from_pandas(validation_data),
    "test": Dataset.from_pandas(test_data)
})

In [35]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/42607 [00:00<?, ? examples/s]

Map:   0%|          | 0/5398 [00:00<?, ? examples/s]

Map:   0%|          | 0/5396 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 42607
    })
    validation: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 5398
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 5396
    })
})

In [36]:
tokenizer.model_max_length

512

In [37]:
chunk_size = 128

In [71]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()
                             if k != '__index_level_0__'}
    
    concatenated_examples['__index_level_0__'] = examples['__index_level_0__']
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [104]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/42607 [00:00<?, ? examples/s]

Map:   0%|          | 0/5398 [00:00<?, ? examples/s]

Map:   0%|          | 0/5396 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 298
    })
    validation: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 38
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 38
    })
})

In [105]:
# drop ['__index_level_0__', 'token_type_ids', ],

lm_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels", 'word_ids'])

In [73]:
# tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'boots [SEP] [CLS] started blog try tie things feel passionate feel related [SEP] [CLS] feel tend overstate things excite casual moviegoer clearly regard retarded [SEP] [CLS] feel like hair planning hostile [SEP] [CLS] feels lovely nourishing but products lower price point [SEP] [CLS] love feeling holiday spirit love wonderful things christmas brings [SEP] [CLS] im likely spend rest day feeling bit naughty undressing strangers eyes hoping ill favourite clients bed soon [SEP] [CLS] don t know just makes feel really clever [SEP] [CLS] got cold feeling extremely grumpy irritable grouchy [SEP] [CLS] think past life really feel amazed seeing moulded amp act situations [SEP] [CLS] remember feeling devastated sat car'

In [106]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [77]:
# samples = [lm_datasets["train"][i] for i in range(2)]
# for sample in samples:
#     _ = sample.pop("word_ids")
# 
# for chunk in data_collator(samples)["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] feel ind excelledcribably enthralled somewhat [MASK] [MASK] [SEP] [CLS] stop feeling helpless [SEP] [CLS] feel irritable ji [MASK]y sick [SEP] [CLS] ive feeling impressed courage patients friends [SEP] [CLS] oscillate speeds no wonder [MASK]hood leaves feeling [MASK] [SEP] [CLS] feel weird talking m hack [SEP] [CLS] sitting coffee fresh [MASK]net [MASK]berries feeling overwhelmed [SEP] [CLS] [MASK] having control actions outcomes saves lot embarrassment but feel like [MASK] lot [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] hadnt honored promise [MASK] writing book series [MASK] years [MASK] husband [MASK] gene simmons [SEP] [CLS] hz [MASK] pain way down limb [SEP] [CLS] im feeling especially naughty sport dirty cop outfit black patent thigh high'

'>>> boots [SEP] [CLS] started blog try tie things feel passionate feel related [SEP] [CLS] feel tend overs [MASK] things ex [MASK] casual moviego [MASK] clearly regard re [MASK]ded [SEP] [CLS] feel like hair planning hostile [SEP] [CLS

In [107]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [108]:
# samples = [lm_datasets["train"][i] for i in range(2)]
# batch = whole_word_masking_data_collator(samples)
# 
# for chunk in batch["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [109]:
train_size = 60  # change later when training, max is 298 chunks??
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 60
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 6
    })
})

In [110]:
from transformers import TrainingArguments

batch_size = 4
# Show the training loss with every epoch
# logging_steps = len(downsampled_dataset["train"]) // batch_size
logging_steps = 1
model_name = f"{model_checkpoint}-wwm"

training_args = TrainingArguments(
    output_dir=f"{model_name}-wmm",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    remove_unused_columns=False,
    # load_best_model_at_end=True,
    # num_train_epochs=8,
)

In [111]:
from transformers import Trainer

# remove the __index_level_0__ column
downsampled_dataset2 = downsampled_dataset.remove_columns("__index_level_0__")


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset2["train"],
    eval_dataset=downsampled_dataset2["test"],
    # data_collator=data_collator,
    data_collator=whole_word_masking_data_collator,
    tokenizer=tokenizer,
)

In [112]:
trainer.train()

ValueError: Expected input batch_size (4) to match target batch_size (512).