https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2 (code below)


https://huggingface.co/transformers/v3.2.0/custom_datasets.html 

In [20]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary #what is that, it works ?
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [4]:
#import the dataset and create a trainig and testing and validation set


isarcasm_data = pd.read_csv('isarcasm2022.csv')

text = isarcasm_data.iloc[: , 1]
isitsarcasm = isarcasm_data.iloc[: , 2]

print(text)
print(isitsarcasm)



0       The only thing I got from college is a caffein...
1       I love it when professors draw a big question ...
2       Remember the hundred emails from companies whe...
3       Today my pop-pop told me I was not “forced” to...
4       @VolphanCarol @littlewhitty @mysticalmanatee I...
                              ...                        
3463    The population spike in Chicago in 9 months is...
3464    You'd think in the second to last English clas...
3465    I’m finally surfacing after a holiday to Scotl...
3466    Couldn't be prouder today. Well done to every ...
3467    Overheard as my 13 year old games with a frien...
Name: tweet, Length: 3468, dtype: object
0       1
1       1
2       1
3       1
4       1
       ..
3463    0
3464    0
3465    0
3466    0
3467    0
Name: sarcastic, Length: 3468, dtype: int64


In [38]:
#create the class that represent the dataset

class IsarcasmDataset(Dataset):
   
    def __init__(self, csv_file, root_dir, transform=None, train_size=0.7, test_size=0.15, val_size=0.15):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.rawisarcasmData = pd.read_csv(csv_file)
        self.isarcasmData = self.rawisarcasmData.sample(frac=1).reset_index(drop=True) #randomly shuffle the set
        self.root_dir = root_dir
        self.transform = transform
        
        train_data, temp_data = train_test_split(self.isarcasmData, test_size=test_size+val_size, random_state=42)
        test_data, val_data = train_test_split(temp_data, test_size=val_size/(test_size+val_size), random_state=42)

        self.train_set = train_data
        self.test_set = test_data
        self.validation_set = val_data

        #self.save_sets()
        
       
    def __len__(self):
        return len(self.isarcasmData)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.isarcasmData.iloc[idx , 1]
        label = self.isarcasmData.iloc[idx , 2]
        
        sample = {'tweet': text, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample
    
    def save_sets(self):
       
        # Save the datasets to separate CSV files
        self.train_set.to_csv('train_dataset.csv', index=False)
        self.test_set.to_csv('test_dataset.csv', index=False)
        self.validation_set.to_csv('val_dataset.csv', index=False)


In [39]:
#instantiate the class

Isarcasm_dataset = IsarcasmDataset(csv_file='isarcasm2022.csv',
                                    root_dir='/')

print(Isarcasm_dataset.__len__()) 
Isarcasm_dataset.__getitem__(4)


#print(Isarcasm_dataset.test_set)

3468
      Unnamed: 0                                              tweet  \
350          809  Semester hasn’t even started and I’ve already ...   
430         2679  How long do you have be out of school before y...   
3290        2608  Going out is overrated! On the couch watching ...   
184         1618  Casually looking at the 06Z forecast it seems ...   
2359        1773                        I passed my permit test!!!🥵   
...          ...                                                ...   
2411        2582     It b the memories we make everyday for me .. 💓   
1812         502  Shaving in the shower without your glasses/con...   
881          684  @DarrkIt @iamspade_ ya need a carry hit me up ...   
3191        1251  Gods playing games today. Makin it 68 degrees ...   
700         2506  https://t.co/ZOkVLoNM6Z\r\n#king810 @KING810FL...   

      sarcastic                                           rephrase  sarcasm  \
350           1  I would say “this isn’t going to be a fun seme

In [None]:
#dataloader 

#dataloader = DataLoader(transformed_dataset, batch_size=4,
                        #shuffle=True, num_workers=0)

In [54]:
class BertDataset(Dataset):
    def __init__(self, tokenizer,max_length):
        super(BertDataset, self).__init__()
        #self.root_dir=root_dir
        self.train_csv=pd.read_csv('val_dataset.csv')
        self.tokenizer=tokenizer
        self.target=self.train_csv.iloc[:,2] #label in column 2
        self.max_length=max_length
        
    def __len__(self):
        return len(self.train_csv)
    
    def __getitem__(self, index):
        
        text1 = self.train_csv.iloc[index,1] #text im column 1
        
        inputs = self.tokenizer.encode_plus( 
            
        #tokenizer settings https://stackoverflow.com/questions/61708486/whats-difference-between-tokenizer-encode-and-tokenizer-encode-plus-in-hugging
        #https://huggingface.co/docs/transformers/en/main_classes/tokenizer   
        #https://huggingface.co/docs/transformers/v4.40.1/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.encode_plus
        
            text1 , #sequnce to be encoded
            None,  #text_pair ??
            pad_to_max_length=True,
            add_special_tokens=True, #https://stackoverflow.com/questions/71679626/what-is-so-special-about-special-tokens
            return_attention_mask=True,
            max_length=self.max_length,
        )
        
        # encodes the text using the BERT tokenizer's encode_plus method, which returns the input IDs, token type IDs, and attention mask.
        ids = inputs["input_ids"] #not really understood this part...
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.train_csv.iloc[index, 2], dtype=torch.long) #label im column 2
            }
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased") 

'''
#in an other project they use that
DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

'''


dataset= BertDataset(tokenizer, max_length=100) #instanciate the dataset, adapt
dataloader=DataLoader(dataset=dataset,batch_size=32) #from, torch adapt

In line 4, we have initialized our pre-trained ‘bert-base-uncased’ BERT model from Hugging face library and followed by initializing our linear dense layer for classifying movie reviews.

Here, we use BCEWithLogitsLoss which combines a Sigmoid layer and the BCELoss in one single class because this version is more numerically stable than using a plain Sigmoid followed by a BCELoss

In [55]:
#https://huggingface.co/docs/transformers/en/model_doc/bert
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.out = nn.Linear(768, 1)
        
    def forward(self,ids,mask,token_type_ids):
        _,o2= self.bert_model(ids,attention_mask=mask,token_type_ids=token_type_ids, return_dict=False)
        
        out= self.out(o2)
        
        return out
    
model=BERT()

loss_fn = nn.BCEWithLogitsLoss()

#Initialize Optimizer
optimizer= optim.Adam(model.parameters(),lr= 0.0001)


irst, we do not retrain our pre-trained BERT and train only the last linear dense layer.
For this, we need to define it as follows :

In [56]:
for param in model.bert_model.parameters():
    param.requires_grad = False #fine tuning only not from 0

In [57]:
def finetune(epochs,dataloader,model,loss_fn,optimizer):
    model.train()
    for  epoch in range(epochs):
        print(epoch)
        
        loop=tqdm(enumerate(dataloader),leave=False,total=len(dataloader))
        for batch, dl in loop:
            ids=dl['ids']
            token_type_ids=dl['token_type_ids']
            mask= dl['mask']
            label=dl['target']
            label = label.unsqueeze(1)
            
            optimizer.zero_grad()
            
            output=model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids)
            label = label.type_as(output)

            loss=loss_fn(output,label)
            loss.backward()
            
            optimizer.step()
            
            pred = np.where(output >= 0, 1, 0)

            num_correct = sum(1 for a, b in zip(pred, label) if a[0] == b[0])
            num_samples = pred.shape[0]
            accuracy = num_correct/num_samples
            
            print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')
            
            # Show progress while training
            loop.set_description(f'Epoch={epoch}/{epochs}')
            loop.set_postfix(loss=loss.item(),acc=accuracy)

    return model

In [59]:
model=finetune(5, dataloader, model, loss_fn, optimizer)

0


Epoch=0/5:   6%|▌         | 1/17 [00:04<01:09,  4.34s/it, acc=0.75, loss=0.645]

Got 24 / 32 with accuracy 75.00


Epoch=0/5:  12%|█▏        | 2/17 [00:09<01:09,  4.60s/it, acc=0.719, loss=0.645]

Got 23 / 32 with accuracy 71.88


Epoch=0/5:  18%|█▊        | 3/17 [00:13<01:02,  4.46s/it, acc=0.719, loss=0.632]

Got 23 / 32 with accuracy 71.88


Epoch=0/5:  24%|██▎       | 4/17 [00:17<00:56,  4.36s/it, acc=0.812, loss=0.604]

Got 26 / 32 with accuracy 81.25


Epoch=0/5:  29%|██▉       | 5/17 [00:22<00:52,  4.40s/it, acc=0.719, loss=0.626]

Got 23 / 32 with accuracy 71.88


Epoch=0/5:  35%|███▌      | 6/17 [00:27<00:52,  4.75s/it, acc=0.781, loss=0.598]

Got 25 / 32 with accuracy 78.12


Epoch=0/5:  41%|████      | 7/17 [00:33<00:50,  5.03s/it, acc=0.688, loss=0.638]

Got 22 / 32 with accuracy 68.75


                                                                                

Got 27 / 32 with accuracy 84.38




ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.