In [1]:
import os
import torch
from transformers import BertTokenizer, BertModel

In [2]:
tokenizer_config = {
    'pretrained_model_name_or_path' : r'C:\experiments\bert_tokenizer',
}

In [3]:
import random
from torch.utils.data import Dataset, DataLoader

random.seed(42)

class ReviewDataset(Dataset):
    def __init__(self, dir_path, split, tokenizer_config, val=False):
        super().__init__()
        self.review_tags = ['pos', 'neg']
        self.pos_review_dir = os.path.join(dir_path, split, 'pos')
        self.neg_review_dir = os.path.join(dir_path, split, 'neg')
        
        if split == 'train':
            if not val:
                self.pos_label = [self.review_tags[0] for i in os.listdir(self.pos_review_dir)][:200]
                self.neg_label = [self.review_tags[1] for i in os.listdir(self.neg_review_dir)][:200]

                self.reviews = [i for i in os.listdir(self.pos_review_dir)][:200] + [i for i in os.listdir(self.neg_review_dir)][:200]
            else:
                self.pos_label = [self.review_tags[0] for i in os.listdir(self.pos_review_dir)][200:250]
                self.neg_label = [self.review_tags[1] for i in os.listdir(self.neg_review_dir)][200:250]

                self.reviews = [i for i in os.listdir(self.pos_review_dir)][200:250] + [i for i in os.listdir(self.neg_review_dir)][200:250]
        else:
            self.pos_label = [self.review_tags[0] for i in os.listdir(self.pos_review_dir)][:50]
            self.neg_label = [self.review_tags[1] for i in os.listdir(self.neg_review_dir)][:50]

            self.reviews = [i for i in os.listdir(self.pos_review_dir)][:50] + [i for i in os.listdir(self.neg_review_dir)][:50] 
            
        self.label = self.pos_label + self.neg_label
        self.data = [{self.reviews[i] : self.label[i]} for i in range(0, len(self.label))]
        self.tokenizer = BertTokenizer.from_pretrained(**tokenizer_config)
        self.sep_token = self.tokenizer.sep_token
        random.shuffle(self.data)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        sample = self.data[index]
        filename, label = list(sample.items())[0]
        if label == 'pos':
            file_path = os.path.join(self.pos_review_dir, filename)
        else:
            file_path = os.path.join(self.neg_review_dir, filename)
        
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()


        sentences = text.split('.')
        sentences = [i.strip().lower() for i in sentences]
        sentences = [i.replace('<br />', '') for i in sentences]
        sentences = [f'{i} {self.sep_token}' for i in sentences if len(i) != 0]
        text = "".join(sentences) + self.sep_token

        tokens = self.tokenizer(text,
                                max_length=128,
                                truncation=True,
                                padding='max_length',
                                return_tensors='pt').to(device='cuda')
        if label == 'pos':
            return tokens, torch.tensor([1, 0], dtype=torch.float32)
        else:
            return tokens, torch.tensor([0, 1], dtype=torch.float32)

In [4]:
class BertClassifier(torch.nn.Module):
    def __init__(self, num_classes, model_config):
        super().__init__()
        self.num_classes = num_classes
        self.model = BertModel.from_pretrained(**model_config)
        self.classifier = torch.nn.Linear(in_features=768, out_features=num_classes, bias=True)
        self.dropout = torch.nn.Dropout(p=0.7, inplace=True)
        self.softmax = torch.nn.Softmax(dim=1)
    
    def forward(self, x):
        op1 = self.model(**x)
        op2 = self.classifier(op1.pooler_output)
        op3 = self.dropout(op2)
        op4 = self.softmax(op3)
        return op4

In [5]:
model_config = {'pretrained_model_name_or_path' : r'C:\experiments\bert_base_uncased'}
model = BertClassifier(num_classes=2, model_config=model_config).to(device='cuda')
state_dict = torch.load(r'bert_uncased.pth', weights_only=True)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [6]:
import torch.nn.functional as F

class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        std_dev=1/torch.sqrt(torch.tensor(rank).float())
        self.A=torch.nn.Parameter(torch.randn(in_dim, rank)*std_dev)
        self.B=torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha=alpha
        
    def forward(self, x):
        x=self.alpha*(x@self.A@self.B)
        return x
    

class LinearWithLoRAMerged(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear=linear
        self.lora=LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )
    
    def forward(self, x):
        lora=self.lora.A @ self.lora.B # combine LoRA metrices
        # then combine LoRA original weights
        combined_weight=self.linear.weight+self.lora.alpha*lora.T
        return F.linear(x, combined_weight, self.linear.bias)

In [7]:
for i in range(0, 11):
    model.model.encoder.layer[i].attention.self.query = LinearWithLoRAMerged(
        linear=model.model.encoder.layer[i].attention.self.query,
        rank=3,
        alpha=0.6
    )
    model.model.encoder.layer[i].attention.self.key = LinearWithLoRAMerged(
        linear=model.model.encoder.layer[i].attention.self.key,
        rank=3,
        alpha=0.6
    )
    model.model.encoder.layer[i].attention.self.value = LinearWithLoRAMerged(
        linear=model.model.encoder.layer[i].attention.self.value,
        rank=3,
        alpha=0.6
    )

model = model.to(device='cuda')

In [8]:
## Freezing all attention layers in the bert encoder
for name, params in model.named_parameters():
    if ('lora' not in name):
        params.requires_grad = False


for name, param in model.named_parameters():
    print(f'{name}:{param.requires_grad}')

model.embeddings.word_embeddings.weight:False
model.embeddings.position_embeddings.weight:False
model.embeddings.token_type_embeddings.weight:False
model.embeddings.LayerNorm.weight:False
model.embeddings.LayerNorm.bias:False
model.encoder.layer.0.attention.self.query.linear.weight:False
model.encoder.layer.0.attention.self.query.linear.bias:False
model.encoder.layer.0.attention.self.query.lora.A:True
model.encoder.layer.0.attention.self.query.lora.B:True
model.encoder.layer.0.attention.self.key.linear.weight:False
model.encoder.layer.0.attention.self.key.linear.bias:False
model.encoder.layer.0.attention.self.key.lora.A:True
model.encoder.layer.0.attention.self.key.lora.B:True
model.encoder.layer.0.attention.self.value.linear.weight:False
model.encoder.layer.0.attention.self.value.linear.bias:False
model.encoder.layer.0.attention.self.value.lora.A:True
model.encoder.layer.0.attention.self.value.lora.B:True
model.encoder.layer.0.attention.output.dense.weight:False
model.encoder.layer.0.

In [9]:
train_dataset = ReviewDataset(
                    dir_path=r'C:\experiments\notebooks\data\reviews\aclImdb',
                    split='train',
                    tokenizer_config=tokenizer_config
)

val_dataset = ReviewDataset(
                    dir_path=r'C:\experiments\notebooks\data\reviews\aclImdb',
                    split='train',
                    tokenizer_config=tokenizer_config,
                    val=True
)


test_dataset = ReviewDataset(
                    dir_path=r'C:\experiments\notebooks\data\reviews\aclImdb',
                    split='test',
                    tokenizer_config=tokenizer_config
)

print(f'Number of reviews in training : {train_dataset.__len__()}')
print(f'Number of reviews in validaton: {val_dataset.__len__()}')
print(f'Number of reviews in test : {test_dataset.__len__()}')


Number of reviews in training : 400
Number of reviews in validaton: 100
Number of reviews in test : 100


In [10]:
train_loader = DataLoader(
                    dataset=train_dataset,
                    batch_size=16,
                    shuffle=True)

val_loader = DataLoader(
                    dataset=val_dataset,
                    batch_size=16,
                    shuffle=True)

test_loader = DataLoader(
                    dataset=test_dataset,
                    batch_size=16,
                    shuffle=True)

In [11]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)
criterion = torch.nn.BCEWithLogitsLoss()

In [12]:
EPOCHS = 20

In [13]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
import copy  # For deep copying the best model

TRAIN_LOSS, VAL_LOSS = list(), list()
TRAIN_ACC, VAL_ACC = list(), list()

best_val_acc = 0
patience, count = 5, 0  # Patience defines how many epochs to wait before stopping
best_model = None  # To store the best model

for epoch in range(EPOCHS):
    train_loss, train_acc = 0, 0
    model.train()
    
    # Training loop
    for tokens, label in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{EPOCHS}"):
        inputs = {
            'input_ids': tokens['input_ids'].squeeze(1),
            'token_type_ids': tokens['token_type_ids'].squeeze(1),
            'attention_mask': tokens['attention_mask'].squeeze(1)
        }
        target = label.to(device='cuda')

        optimizer.zero_grad()
        op = model(inputs)
        loss = criterion(op, target)
        acc = accuracy_score(
            torch.argmax(op, dim=1).detach().cpu().numpy(),
            torch.argmax(target, dim=1).detach().cpu().numpy()
        )
        train_loss += loss.item()
        train_acc += acc

        loss.backward()
        optimizer.step()

    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    # Validation loop
    model.eval()
    val_loss, val_acc = 0, 0
    with torch.no_grad():
        for tokens, label in tqdm(val_loader, desc="Validation"):
            inputs = {
                'input_ids': tokens['input_ids'].squeeze(1),
                'token_type_ids': tokens['token_type_ids'].squeeze(1),
                'attention_mask': tokens['attention_mask'].squeeze(1)
            }
            target = label.to(device='cuda')

            op = model(inputs)
            loss = criterion(op, target)
            acc = accuracy_score(
                torch.argmax(op, dim=1).detach().cpu().numpy(),
                torch.argmax(target, dim=1).detach().cpu().numpy()
            )
            val_loss += loss.item()
            val_acc += acc

    val_loss /= len(val_loader)
    val_acc /= len(val_loader)

    # Logging
    print(f'EPOCH: {epoch + 1}\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    
    TRAIN_LOSS.append(train_loss)
    VAL_LOSS.append(val_loss)
    TRAIN_ACC.append(train_acc)
    VAL_ACC.append(val_acc)

    # Early Stopping Check
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = copy.deepcopy(model)  # Save the best model
        count = 0  # Reset patience counter
    else:
        count += 1
        print(f"Early stopping patience: {count}/{patience}")
        if count >= patience:
            print("Early stopping triggered.")
            break

Training Epoch 1/20: 100%|██████████| 25/25 [00:07<00:00,  3.56it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.78it/s]


EPOCH: 1
Train Loss: 0.6316, Train Acc: 0.7300
Val Loss: 0.6300, Val Acc: 0.7679


Training Epoch 2/20: 100%|██████████| 25/25 [00:06<00:00,  3.88it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.64it/s]


EPOCH: 2
Train Loss: 0.6346, Train Acc: 0.7350
Val Loss: 0.6092, Val Acc: 0.7946


Training Epoch 3/20: 100%|██████████| 25/25 [00:06<00:00,  3.87it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.77it/s]


EPOCH: 3
Train Loss: 0.6271, Train Acc: 0.7450
Val Loss: 0.5991, Val Acc: 0.8393


Training Epoch 4/20: 100%|██████████| 25/25 [00:06<00:00,  3.86it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.77it/s]


EPOCH: 4
Train Loss: 0.6285, Train Acc: 0.7350
Val Loss: 0.6004, Val Acc: 0.8304
Early stopping patience: 1/5


Training Epoch 5/20: 100%|██████████| 25/25 [00:06<00:00,  3.87it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.76it/s]


EPOCH: 5
Train Loss: 0.6248, Train Acc: 0.7350
Val Loss: 0.6035, Val Acc: 0.8304
Early stopping patience: 2/5


Training Epoch 6/20: 100%|██████████| 25/25 [00:06<00:00,  3.88it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.78it/s]


EPOCH: 6
Train Loss: 0.6249, Train Acc: 0.7475
Val Loss: 0.6120, Val Acc: 0.8036
Early stopping patience: 3/5


Training Epoch 7/20: 100%|██████████| 25/25 [00:06<00:00,  3.85it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.77it/s]


EPOCH: 7
Train Loss: 0.6285, Train Acc: 0.7225
Val Loss: 0.6004, Val Acc: 0.8304
Early stopping patience: 4/5


Training Epoch 8/20: 100%|██████████| 25/25 [00:06<00:00,  3.86it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.65it/s]

EPOCH: 8
Train Loss: 0.6330, Train Acc: 0.7100
Val Loss: 0.6106, Val Acc: 0.8125
Early stopping patience: 5/5
Early stopping triggered.





In [15]:
test_loss, test_acc = 0, 0
best_model.eval()
for tokens, label in tqdm(test_loader):
    inputs = {
                    'input_ids': tokens['input_ids'].squeeze(1),
                    'token_type_ids' : tokens['token_type_ids'].squeeze(1),
                    'attention_mask' : tokens['attention_mask'].squeeze(1)
            }
    target = label.to(device='cuda')

    with torch.no_grad():
            op = best_model(inputs)
            loss = criterion(op, target)
            acc = accuracy_score(
                    torch.argmax(op, dim=1).detach().cpu().numpy(), 
                    torch.argmax(target, dim=1).detach().cpu().numpy()
            )
            
    test_loss += loss.item()
    test_acc += acc

print(f'Test Loss: {test_loss/len(test_loader)} Test Acc: {test_acc/len(test_loader)}')

100%|██████████| 7/7 [00:01<00:00,  5.42it/s]

Test Loss: 0.6095321348735264 Test Acc: 0.7857142857142857





In [16]:
torch.save(best_model.state_dict(), "bert_uncased_lora.pth")
print("Model state dictionary saved to 'model.pth'")

Model state dictionary saved to 'model.pth'
