## Reference: https://www.kaggle.com/code/shreydan/deberta-v3-base-accelerate-finetuning

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel,DebertaTokenizer
import pandas as pd
import torch.nn.functional as F
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import gc

from accelerate import Accelerator



In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda:0


In [3]:
import os

#os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_VISIBLE_DEVICES']  = "0"

torch.cuda.device_count()

2

In [4]:
prompts_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
prompts_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")

summaries_train = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
summaries_test = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")

sample_submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [5]:
# dataframe 병합, prompt id 기준으로 각 아이디에 맞는 prompts_train, test 내용을 왼쪽에 붙임
train = summaries_train.merge(prompts_train, how="left", on="prompt_id")
test = summaries_test.merge(prompts_test, how="left", on="prompt_id")

train

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...


In [6]:
train['merged_text'] = 'text '+train['text'] + ' prompt_question ' + train['prompt_question'] + ' prompt_title '+train['prompt_title']+ ' prompt_text '+train['prompt_text']
train['merged_text'].head(2)

0    text The third wave was an experimentto see ho...
1    text They would rub it up with soda to make th...
Name: merged_text, dtype: object

In [7]:
test['merged_text'] = 'text '+ test['text'] + ' prompt_question ' + test['prompt_question'] + ' prompt_title '+ test['prompt_title']+ ' prompt_text '+ test['prompt_text']
test['merged_text'].head(2)

0    text Example text 1 prompt_question Summarize....
1    text Example text 2 prompt_question Summarize....
Name: merged_text, dtype: object

In [8]:
demo_train = train[:100]

In [9]:
config = {
    'model': '/kaggle/input/debertav3base',
    'dropout': 0.5,
    'max_length': 512,
    'batch_size': 8, # anything more results in CUDA OOM [for unfreezed encoder] on Kaggle GPU
    'epochs': 10,
    'lr': 3e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

In [10]:
print(config)

{'model': '/kaggle/input/debertav3base', 'dropout': 0.5, 'max_length': 512, 'batch_size': 8, 'epochs': 10, 'lr': 0.0003, 'enable_scheduler': True, 'scheduler': 'CosineAnnealingWarmRestarts', 'gradient_accumulation_steps': 2, 'adam_eps': 1e-06, 'freeze_encoder': True}


In [11]:
tokenizer = AutoTokenizer.from_pretrained(config['model'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
class DebertaDataset:
    def __init__(self, df, config, tokenizer=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.classes = ['content','wording']
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __getitem__(self,idx):
        sample = self.df['merged_text'][idx]
        tokenized = tokenizer.encode_plus(sample,
                                          None,
                                          add_special_tokens=True,
                                          max_length=self.max_len,
                                          truncation=True,
                                          padding='max_length'
                                         )

        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }

        if self.is_test == True:
            return inputs

        label = self.df.loc[idx,self.classes].to_list()
        targets = {
            "labels": torch.tensor(label, dtype=torch.float32),
        }

        return inputs, targets

    def __len__(self):
        return len(self.df)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train_df, val_df = train_test_split(demo_train,test_size=0.2,random_state=42,shuffle=True)
print('dataframe shapes:',train_df.shape, val_df.shape)

dataframe shapes: (80, 9) (20, 9)


In [15]:
train_ds = DebertaDataset(train_df, config, tokenizer=tokenizer)
val_ds = DebertaDataset(val_df, config, tokenizer=tokenizer)
test_ds = DebertaDataset(test, config, tokenizer=tokenizer, is_test=True)

In [16]:
train_loader = torch.utils.data.DataLoader(train_ds,
                                           batch_size=config['batch_size'],
                                           shuffle=True,
                                           num_workers=2,
                                           pin_memory=True
                                          )
val_loader = torch.utils.data.DataLoader(val_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=True,
                                         num_workers=2,
                                         pin_memory=True
                                        )

In [17]:
print('loader shapes:',len(train_loader), len(val_loader))

loader shapes: 10 3


# Model

In [18]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [19]:
class DebertaModel(nn.Module):
    def __init__(self,config,num_classes=2):
        super(DebertaModel,self).__init__()
        self.model_name = config['model']
        self.freeze = config['freeze_encoder']

        self.encoder = AutoModel.from_pretrained(self.model_name).to(device)
        if self.freeze:
            for param in self.encoder.base_model.parameters():
                param.requires_grad = False

        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(config['dropout'])
        self.fc1 = nn.Linear(self.encoder.config.hidden_size,64).to(device)
        self.fc2 = nn.Linear(64,num_classes).to(device)


    def forward(self,inputs):
        #print("Deberta forward encoder:",next(self.encoder.parameters()).device)
        outputs = self.encoder(**inputs,return_dict=True)
        outputs = self.pooler(outputs['last_hidden_state'], inputs['attention_mask']).to(device)
        outputs = self.fc1(outputs).to(device)
        outputs = self.fc2(outputs).to(device)
        return outputs

In [20]:
model_save_path = '/kaggle/working/model.pt' # /kaggle/working/model.pt

In [21]:
class Trainer:

    def __init__(self, model, loaders, config, accelerator):
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        self.input_keys = ['input_ids','token_type_ids','attention_mask']
        self.accelerator = accelerator

        self.optim = self._get_optim()

        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7)

        self.train_losses = []
        self.val_losses = []

    def prepare(self):
        self.model, self.optim, self.train_loader, self.val_loader, self.scheduler = self.accelerator.prepare(
            self.model,
            self.optim,
            self.train_loader,
            self.val_loader,
            self.scheduler
        )

    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.config['lr'], eps=self.config['adam_eps'])
        return optimizer

    # RMSE
    def loss_fn(self, outputs, targets):
        #print('target', targets)
        #print("Targets shape:", targets.shape)
        #print("Outputs shape:", outputs.shape)
        #print(torch.mean(torch.square(targets - outputs), dim=0))
        colwise_mse = torch.mean(torch.square(targets - outputs), dim=0)
        loss = torch.mean(torch.sqrt(colwise_mse), dim=0)
        return loss

    # Train Loop
    def train_one_epoch(self,epoch):

        running_loss = 0.
        progress = tqdm(self.train_loader, total=len(self.train_loader))
        #print("progress: ", progress)
        # input, targets
        for idx,(inputs,targets) in enumerate(progress):
          #######
            for batch in self.train_loader:
              inputs = {k: v.to(device=accelerator.device) for k, v in batch[0].items()}
              targets = {k: v.to(device=accelerator.device) for k, v in batch[1].items()}

            with self.accelerator.accumulate(self.model):
                #print("T Input Device:", inputs['attention_mask'].device)
                #print("T Target Device:", targets['labels'].device)
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, targets['labels'])
                running_loss += loss.item()

                self.accelerator.backward(loss)

                self.optim.step()

                if self.config['enable_scheduler']:
                    self.scheduler.step(epoch - 1 + idx / len(self.train_loader))

                self.optim.zero_grad()

                del inputs, targets, outputs, loss


        train_loss = running_loss/len(self.train_loader)
        #print(f"Epoch {epoch+1} Loss: {train_loss}")
        self.train_losses.append(train_loss)

    # Validation Loop
    @torch.no_grad()
    def valid_one_epoch(self,epoch):
        min_val_loss = np.inf
        running_loss = 0.
        progress = tqdm(self.val_loader, total=len(self.val_loader))
        ########
        for batch in self.train_loader:
          inputs = {k: v.to(device=accelerator.device) for k, v in batch[0].items()}
          targets = {k: v.to(device=accelerator.device) for k, v in batch[1].items()}
        for (inputs, targets) in progress:
            #print("V Input Device:", inputs['attention_mask'].device)
            #print("V Target Device:", targets['labels'].device)
            outputs = self.model(inputs)

            loss = self.loss_fn(outputs, targets['labels'])
            running_loss += loss.item()

            del inputs, targets, outputs, loss


        val_loss = running_loss/len(self.val_loader)
        #print(f"Validation Loss: {val_loss}")
        self.val_losses.append(val_loss)

        if val_loss < min_val_loss:
            #patience = 0
            min_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f'saving model with score: {val_loss}')

    # Test
    def test(self, test_loader):
        #for batch in test_loader:
          #inputs = {k: v.to(device) for k, v in batch.items()}

        preds = []
        for (batch) in test_loader: # inputs
          inputs = {k: v.to(device) for k, v in batch.items()}
          outputs = self.model(inputs)
          preds.append(outputs.detach().cpu())
          print(preds)
        preds = torch.concat(preds)

        return preds

    def fit(self):

        patience = 0
        self.prepare()

        fit_progress = tqdm(
            range(1, self.config['epochs']+1),
            leave = True,
            desc="Training..."
        )

        for epoch in fit_progress:

            self.model.train()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch)
            self.clear()

            self.model.eval()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | validating...")
            self.valid_one_epoch(epoch)

            patience += 1
            if patience >= 10:
              print('Early Stopping trigerred on epoch: {}')
              break

            self.clear()

            print(f"{'➖️'*10} EPOCH {epoch} / {self.config['epochs']} {'➖️'*10}")
            print(f"train loss: {self.train_losses[-1]}")
            print(f"valid loss: {self.val_losses[-1]}\n\n")


    def clear(self):
        gc.collect()
        torch.cuda.empty_cache()

# Train with huggingface Accelerate

In [22]:
accelerator = Accelerator(gradient_accumulation_steps=config['gradient_accumulation_steps'])

In [23]:
model = DebertaModel(config).to(device=accelerator.device)

In [24]:
print(next(model.parameters()).device)

cuda:0


In [25]:
trainer = Trainer(model, (train_loader, val_loader), config, accelerator)

In [26]:
'''for batch in train_loader:
    # Check the device of the tensors in the batch
    inputs = {k: v.to('cuda') for k, v in batch[0].items()}
    targets = {k: v.to('cuda') for k, v in batch[1].items()}
    for key, value in inputs.items():
        if isinstance(value, torch.Tensor):
            print(f"Tensor '{key}' is on device: {value.device}")'''

'for batch in train_loader:\n    # Check the device of the tensors in the batch\n    inputs = {k: v.to(\'cuda\') for k, v in batch[0].items()}\n    targets = {k: v.to(\'cuda\') for k, v in batch[1].items()}\n    for key, value in inputs.items():\n        if isinstance(value, torch.Tensor):\n            print(f"Tensor \'{key}\' is on device: {value.device}")'

In [27]:
trainer.fit()

Training...:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.203389048576355
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 1 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.9600032269954681
valid loss: 1.203389048576355




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.2474627296129863
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 2 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 1.073867493867874
valid loss: 1.2474627296129863




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.030956248442332
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 3 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.9612085878849029
valid loss: 1.030956248442332




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.1273335218429565
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 4 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.9967163562774658
valid loss: 1.1273335218429565




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.1928850412368774
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 5 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.9425550282001496
valid loss: 1.1928850412368774




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.0135352611541748
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 6 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.9645654678344726
valid loss: 1.0135352611541748




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 0.9041284720102946
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 7 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 1.101974332332611
valid loss: 0.9041284720102946




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.0201706091562908
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 8 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.9535142004489898
valid loss: 1.0201706091562908




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.074762225151062
➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️ EPOCH 9 / 10 ➖️➖️➖️➖️➖️➖️➖️➖️➖️➖️
train loss: 0.860908567905426
valid loss: 1.074762225151062




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving model with score: 1.056887725989024
Early Stopping trigerred on epoch: {}


In [28]:
from torch.utils.data import TensorDataset, DataLoader

In [29]:
test_loader = torch.utils.data.DataLoader(test_ds,
                                         batch_size=config['batch_size'],
                                         shuffle=True,
                                         num_workers=2,
                                         pin_memory=True
                                        )

In [30]:
#for i in model.named_parameters():
#    print(f"{i[0]} -> {i[1].device}")

In [31]:
checkpoint = DebertaModel(config).to(device=accelerator.device)
checkpoint.load_state_dict(torch.load(model_save_path))

<All keys matched successfully>

In [32]:
tester = Trainer(checkpoint, (train_loader, val_loader), config, accelerator)
results = tester.test(test_loader)

[tensor([[0.7329, 0.0637],
        [0.8478, 0.1977],
        [0.7344, 0.0567],
        [0.8004, 0.1642]])]


In [33]:
final = results.numpy()
final

array([[0.73286206, 0.06365981],
       [0.847786  , 0.19766265],
       [0.7343775 , 0.05674871],
       [0.8003521 , 0.16415498]], dtype=float32)

In [34]:
submission_df = pd.DataFrame({
    'student_id': test['student_id'],
    'content': [pred[0] for pred in final],
    'wording': [pred[1] for pred in final]
})

In [35]:
submission_df.to_csv('submission.csv', index=False)
submission_df

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.732862,0.06366
1,111111eeeeee,0.847786,0.197663
2,222222cccccc,0.734378,0.056749
3,333333dddddd,0.800352,0.164155


In [36]:
#import matplotlib.pyplot as plt

In [37]:
'''plt.plot(trainer.train_losses, color='red')
plt.plot(trainer.val_losses, color='orange')
plt.title('MCRMSE Loss')
plt.legend(['Train', 'Validation'], loc='upper right')'''

"plt.plot(trainer.train_losses, color='red')\nplt.plot(trainer.val_losses, color='orange')\nplt.title('MCRMSE Loss')\nplt.legend(['Train', 'Validation'], loc='upper right')"