In [1]:
import pandas as pd
from transformers import AutoConfig, AutoTokenizer, AutoModel
from transformers import AdamW
import tqdm
import numpy as np
import dataloaders
preprocess = dataloaders.prepos()

In [2]:
train = pd.read_csv('dataset/data/Train-word.csv', sep='\t')
validation = pd.read_csv('dataset/data/Val-word.csv', sep='\t')

In [3]:
# train.groupby('label').count()

In [4]:
config = AutoConfig.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
ParsBERT = AutoModel.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# using the tokens from BertTokenizer
sep_token = tokenizer.sep_token
cls_token = tokenizer.cls_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token#using the token ids
sep_token_idx = tokenizer.sep_token_id
cls_token_idx = tokenizer.cls_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

In [6]:
def transform_dtaframe(df:pd.DataFrame,
                       preprocess):
    df['INPUT_A'] = df['premise'].apply(preprocess.forward)
    df['INPUT_B'] = df['hypothesis'].apply(preprocess.forward)
    df['INPUT'] = df['INPUT_B'] + ' ' + sep_token + ' '  + df['INPUT_A']
    return df
train = transform_dtaframe(train,preprocess)
validation = transform_dtaframe(validation,preprocess)

# train

In [7]:
# list(train['INPUT'])

In [8]:
TOKENIZERS_PARALLELISM = False
input = tokenizer(list(train['INPUT']),padding=True,truncation=True,max_length=150,return_tensors="pt")['input_ids']
input_validation= tokenizer(list(validation['INPUT']),padding=True,truncation=True,max_length=150,return_tensors="pt")['input_ids']

In [9]:
from    torch.utils.data           import Dataset,DataLoader
import torch
class dataset(Dataset):
    def __init__(self , dataframe,input):
        self.dataframe = dataframe
        self.input = input
        #  {0:"contradiction",1: "entailment", 2:"neutral"}
        self.label_map = {'c':0,
                          'e':1,
                          'n':2}
        
    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        
        label = self.label_map[self.dataframe['label'][idx]]
        return self.input[idx] , torch.tensor(label)
    
def collate_fn(batch):
    sources = [item[0] for item in batch]
    targets = [item[1] for item in batch]
              
    sources = torch.nn.utils.rnn.pad_sequence(sources, batch_first=True)
    # targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)
    return sources, targets

In [10]:
train_Set = dataset(train,input)
Validation_Set = dataset(validation,input_validation)

In [11]:
train_dataloader = DataLoader(
                                                train_Set,
                                                batch_size=64,
                                                shuffle=True,
                                                num_workers=2,
                                                )
validation_dataloader = DataLoader(
                                                Validation_Set,
                                                batch_size=64,
                                                shuffle=True,
                                                num_workers=2,
                                                )

In [12]:
class CustomClassifier(torch.nn.Module):
    def __init__(self,):
        super().__init__()
        self.dropout = torch.nn.Dropout(0.7)
        self.num_labels = 3
        self.classifier = torch.nn.Linear(768, self.num_labels)
        
    def forward(self, input):
        pooled_output = self.dropout(input)
        logits = self.classifier(pooled_output)

        return logits


classifier = CustomClassifier()

In [17]:
num_epochs = 5


In [18]:
device = 'cuda'


optimizer_Bert = torch.optim.AdamW(ParsBERT.parameters(), lr=2e-6)
"""
1.lr=2e-5

"""
total_steps = len(train_dataloader) * num_epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer_Bert, max_lr=5e-5, steps_per_epoch=len(train_dataloader), epochs=num_epochs)

optimizer_class = torch.optim.Adam(classifier.parameters(),lr=0.00005)


criterion = torch.nn.CrossEntropyLoss()

ParsBERT.to(device)
classifier.to(device)


CustomClassifier(
  (dropout): Dropout(p=0.7, inplace=False)
  (classifier): Linear(in_features=768, out_features=3, bias=True)
)

In [15]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [19]:
for batch in range(num_epochs):
    list_accu_train = torch.zeros(size=[len(train_dataloader)])
    list_accu_validation = torch.zeros(size=[len(validation_dataloader)])

    for i,(sentences,labels) in tqdm.tqdm(enumerate(train_dataloader)):
        optimizer_Bert.zero_grad()
        optimizer_class.zero_grad()
        
        sentences , labels= sentences.to(device) , labels.to(device)
        outputs = ParsBERT(sentences)
        pred = classifier(outputs.pooler_output)
        
        
        
        # accu = 
        list_accu_train[i] = ((pred.argmax(dim=1)==labels).sum()/len(labels))*100
        # print(list_accu[i])


        loss = criterion(pred,labels)
        loss.backward()
        
        
        optimizer_Bert.step()
        optimizer_class.step()
        
        scheduler.step()

    print(list_accu_train.mean(),loss)
    
    with torch.inference_mode():
        ParsBERT.eval()
        classifier.eval()
        for i,(sentences,labels) in tqdm.tqdm(enumerate(validation_dataloader)):
            optimizer_Bert.zero_grad()
            optimizer_class.zero_grad()
            
            sentences , labels= sentences.to(device) , labels.to(device)
            outputs = ParsBERT(sentences)
            pred = classifier(outputs.pooler_output)
            
            # accu = 
            list_accu_validation[i] = ((pred.argmax(dim=1)==labels).sum()/len(labels))*100
            # print(list_accu[i])


            loss = criterion(pred,labels)
    print(list_accu_validation.mean(),loss)
    print(f"batch is {batch}")
        


114it [03:11,  1.68s/it]

tensor(98.6431) tensor(0.0126, device='cuda:0', grad_fn=<NllLossBackward0>)



25it [00:17,  1.45it/s]

tensor(80.3125) tensor(0.0087, device='cuda:0')
batch is 0



114it [03:12,  1.69s/it]

tensor(96.3026) tensor(0.0959, device='cuda:0', grad_fn=<NllLossBackward0>)



25it [00:17,  1.45it/s]

tensor(73.6250) tensor(3.4477, device='cuda:0')
batch is 1



18it [00:32,  1.79s/it]


KeyboardInterrupt: 