In [6]:
# !pip install accelerate
# !pip install datasets



In [7]:
# !pip install peft



### Imports

In [8]:
import os
import nltk
from datasets import DatasetDict
from sklearn.feature_extraction import text
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from transformers import TrainerCallback
from transformers import DataCollatorWithPadding

from peft import PeftModel, LoraConfig, get_peft_model

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# if torch.cuda.is_available():
#     print("CUDA is available! ")
# else:
#     print("CUDA is not available.")

# print(torch.version.cuda)

CUDA is available! 
12.1


In [43]:
# from google.colab import drive
# drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Pre- processing

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_dir = "drive/MyDrive/data"

data = pd.read_json(os.path.join(data_dir, 'data.jsonl'), lines=True)
test_data = pd.read_json(os.path.join(data_dir, 'test_final.jsonl'), lines=True)
train_data = pd.read_json(os.path.join(data_dir, 'train_final.jsonl'), lines=True)
validation_data = pd.read_json(os.path.join(data_dir, 'validation_final.jsonl'), lines=True)


# Remove duplicates
test_data = test_data.drop_duplicates(subset=['text'])
train_data = train_data.drop_duplicates(subset=['text'])
validation_data = validation_data.drop_duplicates(subset=['text'])


## Loading BERT pre-trained model

In [45]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# Initialize the BERT tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

print(model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Loading LoRA

In [46]:
lora_config = LoraConfig(
    r=16, #8
    lora_alpha=64, #32
    target_modules=[
        "bert.encoder.layer.{}.attention.self.query".format(i) for i in range(16)
    ] + [
        "bert.encoder.layer.{}.attention.self.value".format(i) for i in range(16)
    ],
    lora_dropout=0.01,
    bias="lora_only"
)

## Apply LoRA to the model

In [47]:
model = get_peft_model(model, lora_config)
print(model)

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=76

## Tokenize data

In [48]:
# Tokenize data
def tokenize_data(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_data(train_data['text'].tolist())
val_encodings = tokenize_data(validation_data['text'].tolist())
test_encodings = tokenize_data(test_data['text'].tolist())


### Create dataset

In [49]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_data['label'].tolist())
val_dataset = TextDataset(val_encodings, validation_data['label'].tolist())
test_dataset = TextDataset(test_encodings, test_data['label'].tolist())

#### Train model

In [50]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [51]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    label_names=["labels"]
)

print("Training labels range: ", min(train_data['label']), "to", max(train_data['label']))
print("Validation labels range: ", min(validation_data['label']), "to", max(validation_data['label']))
print("Test labels range: ", min(test_data['label']), "to", max(test_data['label']))

print("Training data NaN values:", train_data.isnull().values.any())
print("Validation data NaN values:", validation_data.isnull().values.any())
print("Test data NaN values:", test_data.isnull().values.any())

 # Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print(trainer)



Training labels range:  0 to 5
Validation labels range:  0 to 5
Test labels range:  0 to 5
Training data NaN values: False
Validation data NaN values: False
Test data NaN values: False
<transformers.trainer.Trainer object at 0x7bf83d6cad40>


In [52]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.7338,1.383233,0.571693,0.560143,0.562822,0.571693
2,1.4373,1.115568,0.710078,0.701633,0.712215,0.710078
3,1.1999,0.984909,0.714153,0.722558,0.782986,0.714153
4,0.9653,0.898339,0.771397,0.777171,0.809372,0.771397
5,0.9048,0.837209,0.798073,0.802868,0.829073,0.798073
6,0.8539,0.788029,0.819192,0.822448,0.844418,0.819192




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.7338,1.383233,0.571693,0.560143,0.562822,0.571693
2,1.4373,1.115568,0.710078,0.701633,0.712215,0.710078
3,1.1999,0.984909,0.714153,0.722558,0.782986,0.714153
4,0.9653,0.898339,0.771397,0.777171,0.809372,0.771397
5,0.9048,0.837209,0.798073,0.802868,0.829073,0.798073
6,0.8539,0.788029,0.819192,0.822448,0.844418,0.819192
7,0.7897,0.758742,0.830122,0.834272,0.856103,0.830122
8,0.773,0.744819,0.835865,0.839649,0.858728,0.835865




TrainOutput(global_step=5328, training_loss=1.0363913899785406, metrics={'train_runtime': 4896.5723, 'train_samples_per_second': 69.611, 'train_steps_per_second': 1.088, 'total_flos': 2.257595329665024e+16, 'train_loss': 1.0363913899785406, 'epoch': 8.0})

In [53]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.7448194026947021, 'eval_accuracy': 0.8358651352352723, 'eval_f1': 0.8396486610585813, 'eval_precision': 0.8587281513341083, 'eval_recall': 0.8358651352352723, 'eval_runtime': 34.2597, 'eval_samples_per_second': 157.561, 'eval_steps_per_second': 2.481, 'epoch': 8.0}


In [54]:
model_path = f'./my_trained_models/bert-lora'
trainer.save_model(model_path)

