# Data Loading

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Initialize the accelerator
accelerator = Accelerator()

# read data
train_raw = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Package Importing

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer, AutoConfig, RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Sampler, Dataset, DataLoader
from accelerate import Accelerator
from tqdm import tqdm
import random
import copy
import os
import multiprocessing
from sklearn.model_selection import StratifiedKFold
import string
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import OrderedDict

 # Preprocessing

In [1]:
# remove duplicates
train_no_duplicates = train_raw.drop_duplicates('text')
# remove contradictory
duplicates_df = train_raw[train_raw.text.duplicated(keep=False)].sort_values('text')
contradictory_tweets = set()
for tweet in list(duplicates_df.text):
    if len(set(duplicates_df[duplicates_df['text'] == tweet].target)) > 1:
        contradictory_tweets.add(tweet)

contradictory_tweets = list(contradictory_tweets)

filtered_df = train_no_duplicates[~train_no_duplicates['text'].isin(contradictory_tweets)]

# Text cleaning function
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from text
    text = re.sub(r'\@\w+|\#','', text)
    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase to maintain consistency
    text = text.lower()
    return text

# Apply the text cleaning function
filtered_df['clean_text'] = filtered_df['text'].apply(clean_text)
test['clean_text'] = test['text'].apply(clean_text)

def combine_columns(row):
    values = [f"{col}: {str(row[col])}" for col in row.index[1:-2] if pd.notnull(row[col])]
    return ' '.join(values)

# Combine the three columns into a single column
filtered_df['combined'] = filtered_df.apply(combine_columns, axis=1)
filtered_df['final_text'] =  filtered_df['clean_text']+' '+ filtered_df['combined']

#test set
filtered_test = test.copy()
filtered_test['combined'] = filtered_test.apply(combine_columns, axis=1)
filtered_test['final_text'] =  filtered_test['clean_text']+' '+ filtered_test['combined']

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv




18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['clean_text'] = filtered_df['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['combined'] = filtered_df.apply(combine_columns, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['final_text'] =  filtered_df['clean_text']+' '+ filtered_d

# BERT Model

In [None]:
class MyDataset(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, target='target', max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append(((tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device)), torch.tensor(row[target]).to(device)))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
    def train_valid_split(self, train_fraction=.8, shuffle=True):
        num_train_examples = int(len(self) * train_fraction)
        train_dataset = copy.deepcopy(self)
        
        if shuffle:
            random.shuffle(train_dataset.data)
        
        valid_dataset = copy.deepcopy(train_dataset)
        train_dataset.data = train_dataset.data[:num_train_examples]
        valid_dataset.data = valid_dataset.data[num_train_examples:]
        
        return train_dataset, valid_dataset

In [None]:
class MyDatasetTest(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append((torch.tensor(row['id']).to(device) ,(tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device))))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class FinetuneClassifier(nn.Module):
    def __init__(self, model=model_checkpoint, classes=2, head_dropout=0.2):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(model)
        hidden_size = self.model.config.hidden_size
        
        self.project = torch.nn.Sequential(
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, classes) # projection
        )

    def forward(self, input_ids, attention_mask=None):
        res = self.model.forward(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        res = res[0]
        res = res[:,0,:] # encoding for <s> token
        res = self.project(res)
        return res
    
    def parameters_num(self):
        return sum(p.numel() for p in self.parameters())

In [None]:
def train(model, 
          train_dataloader, 
          valid_dataloader, 
          steps, 
          optimizer,
          accelerator,
          blind_steps=None,
          loss_fn=torch.nn.BCELoss(),
          main_metric=('f1', f1_score), 
          additional_metrics=[],
          filepath='model_best_BERT.pt',
          load_best=True,
          scheduler=None,
          losses_dict=None):
    
    if blind_steps == None:
        blind_steps = len(train_dataloader) // 4
    
    def evaluate():  # the first score returned is the main
        model.eval()
        
        y_trues = []
        y_hats = []
        
        loss = 0
        k = 0
        
        with torch.no_grad():
            for batch in valid_dataloader:
                
                (ids, mask), y_true = batch
                ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
                y_true = accelerator.prepare(y_true)
                hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
                y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)

                loss += float(loss_fn(y_hat, hots))
                k += 1
                
                for i in range(y_true.shape[0]):
                    y_trues.append(int(y_true[i]))
                    y_hats.append(1 if y_hat[i][0] < y_hat[i][1] else 0)
        
        scores = [(main_metric[0], main_metric[1](y_trues, y_hats))]
        
        for metric in additional_metrics:
            scores.append((metric[0], metric[1](y_trues, y_hats)))        
        
        model.train()
        return scores + [('valid_loss', loss/k)]
    
    
    def render_scores(scores, step, best=None):
        print('{:05d} steps'.format(step), end=' ')
        
        for score in scores:
            print("| {}: {:.3f}".format(*score), end=' ')
            
        if best != None:
            print('| best_score: {:.3f}'.format(best))
            
    
    # initial scores
    scores = evaluate()
    render_scores(scores, 0)
    best_score = scores[0][1]
    torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
    
    # logs
    if losses_dict != None:
        losses_dict['train_loss'] = []
        losses_dict['valid_loss'] = []
        losses_dict[main_metric[0]] = []
    
    epoch_loss = 0
    k = 0
    
    train_iter = iter(train_dataloader)
    model.train()
    
    for step in tqdm(range(steps)):
        
        # retrieving a batch
        try:
            batch = next(train_iter)
        except:
            train_iter = iter(train_dataloader)
            batch = next(train_iter)

        (ids, mask), y_true = batch
        ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
        y_true = accelerator.prepare(y_true)

        # prediction
        y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)
        hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
        loss = loss_fn(y_hat, hots)
        
        # backprop
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        
        if scheduler != None:
            scheduler.step()
            
        epoch_loss += float(loss)
        k += 1
        
        # evaluation
        if (step + 1) % blind_steps == 0:
            scores = evaluate() + [('train_loss', epoch_loss/k)]
            
            if losses_dict != None:
                losses_dict['valid_loss'].append(float(scores[-2][1]))
                losses_dict['train_loss'].append(float(scores[-1][1]))
                losses_dict[main_metric[0]].append(float(scores[0][1]))
            
            if scores[0][1] > best_score:
                best_score = scores[0][1]
                torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
                
            render_scores(scores, step + 1, best=best_score)
            epoch_loss = 0
            k = 0
                
    if load_best:
        state_dict = torch.load(filepath)

        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if "module." not in k:
                name = 'module.' + k
                new_state_dict[name] = v

        model.load_state_dict(new_state_dict)

In [2]:
device = accelerator.device
model_checkpoint = "bert-large-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create an instance of your custom Dataset class
dataset = MyDataset(filtered_df, 'final_text', tokenizer)
train_dataset, valid_dataset = dataset.train_valid_split()

dataset_test = MyDatasetTest(filtered_test, 'final_text', tokenizer)

# Create a DataLoader
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

model = FinetuneClassifier(head_dropout=.1)
model = nn.DataParallel(model)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=500) 
logs_dict = {}

In [2]:
train(
  model, 
  train_dataloader, 
  valid_dataloader, 
  2000, 
  optimizer, 
  accelerator,
  blind_steps=100, 
  additional_metrics=[('precision', precision_score), ('recall', recall_score),('accuracy', accuracy_score)],
  losses_dict=logs_dict,
  scheduler=scheduler
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

100%|████████████████████████████| 7485/7485 [00:12<00:00, 583.51it/s]
100%|███████████████████████████| 3263/3263 [00:02<00:00, 1165.81it/s]


Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


00000 steps | f1: 0.281 | precision: 0.398 | recall: 0.217 | accuracy: 0.525 | valid_loss: 0.691 

100%|██████████| 100/100 [03:19<00:00,  1.99s/it]

00100 steps | f1: 0.722 | precision: 0.784 | recall: 0.669 | accuracy: 0.780 | valid_loss: 0.489 | train_loss: 0.606 | best_score: 0.722





# RoBERTa Model

In [1]:
class MyDataset(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, target='target', max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append(((tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device)), torch.tensor(row[target]).to(device)))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
    def train_valid_split(self, train_fraction=.8, shuffle=True):
        num_train_examples = int(len(self) * train_fraction)
        train_dataset = copy.deepcopy(self)
        
        if shuffle:
            random.shuffle(train_dataset.data)
        
        valid_dataset = copy.deepcopy(train_dataset)
        train_dataset.data = train_dataset.data[:num_train_examples]
        valid_dataset.data = valid_dataset.data[num_train_examples:]
        
        return train_dataset, valid_dataset

NameError: name 'Dataset' is not defined

In [None]:
class MyDatasetTest(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append((torch.tensor(row['id']).to(device) ,(tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device))))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class FinetuneClassifier(nn.Module):
    def __init__(self, model=model_checkpoint, classes=2, head_dropout=0.2):
        super().__init__()
        
        self.model = RobertaModel.from_pretrained(model)
        hidden_size = self.model.config.hidden_size
        
        self.project = torch.nn.Sequential(
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, classes) # projection
        )

    def forward(self, input_ids, attention_mask=None):
        res = self.model.forward(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        res = res[0]
        res = res[:,0,:] # encoding for <s> token
        res = self.project(res)
        return res
    
    def parameters_num(self):
        return sum(p.numel() for p in self.parameters())

In [None]:
def train(model, 
          train_dataloader, 
          valid_dataloader, 
          steps, 
          optimizer,
          accelerator,
          blind_steps=None,
          loss_fn=torch.nn.BCELoss(),
          main_metric=('f1', f1_score), 
          additional_metrics=[],
          filepath='model_best_RoBERTa.pt',
          load_best=True,
          scheduler=None,
          losses_dict=None):
    
    if blind_steps == None:
        blind_steps = len(train_dataloader) // 4
    
    def evaluate():  # the first score returned is the main
        model.eval()
        
        y_trues = []
        y_hats = []
        
        loss = 0
        k = 0
        
        with torch.no_grad():
            for batch in valid_dataloader:
                
                (ids, mask), y_true = batch
                ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
                y_true = accelerator.prepare(y_true)
                hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
                y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)

                loss += float(loss_fn(y_hat, hots))
                k += 1
                
                for i in range(y_true.shape[0]):
                    y_trues.append(int(y_true[i]))
                    y_hats.append(1 if y_hat[i][0] < y_hat[i][1] else 0)
        
        scores = [(main_metric[0], main_metric[1](y_trues, y_hats))]
        
        for metric in additional_metrics:
            scores.append((metric[0], metric[1](y_trues, y_hats)))        
        
        model.train()
        return scores + [('valid_loss', loss/k)]
    
    
    def render_scores(scores, step, best=None):
        print('{:05d} steps'.format(step), end=' ')
        
        for score in scores:
            print("| {}: {:.3f}".format(*score), end=' ')
            
        if best != None:
            print('| best_score: {:.3f}'.format(best))
            
    
    # initial scores
    scores = evaluate()
    render_scores(scores, 0)
    best_score = scores[0][1]
    torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
    
    # logs
    if losses_dict != None:
        losses_dict['train_loss'] = []
        losses_dict['valid_loss'] = []
        losses_dict[main_metric[0]] = []
    
    epoch_loss = 0
    k = 0
    
    train_iter = iter(train_dataloader)
    model.train()
    
    for step in tqdm(range(steps)):
        
        # retrieving a batch
        try:
            batch = next(train_iter)
        except:
            train_iter = iter(train_dataloader)
            batch = next(train_iter)

        (ids, mask), y_true = batch
        ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
        y_true = accelerator.prepare(y_true)

        # prediction
        y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)
        hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
        loss = loss_fn(y_hat, hots)
        
        # backprop
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        
        if scheduler != None:
            scheduler.step()
            
        epoch_loss += float(loss)
        k += 1
        
        # evaluation
        if (step + 1) % blind_steps == 0:
            scores = evaluate() + [('train_loss', epoch_loss/k)]
            
            if losses_dict != None:
                losses_dict['valid_loss'].append(float(scores[-2][1]))
                losses_dict['train_loss'].append(float(scores[-1][1]))
                losses_dict[main_metric[0]].append(float(scores[0][1]))
            
            if scores[0][1] > best_score:
                best_score = scores[0][1]
                torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
                
            render_scores(scores, step + 1, best=best_score)
            epoch_loss = 0
            k = 0
                
    if load_best:
        state_dict = torch.load(filepath)

        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if "module." not in k:
                name = 'module.' + k
                new_state_dict[name] = v

        model.load_state_dict(new_state_dict)

In [1]:
device = accelerator.device
model_checkpoint = "roberta-large"

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
model = RobertaModel.from_pretrained(model_checkpoint)

# Create an instance of your custom Dataset class
dataset = MyDataset(filtered_df, 'final_text', tokenizer)
train_dataset, valid_dataset = dataset.train_valid_split()

dataset_test = MyDatasetTest(filtered_test, 'final_text', tokenizer)

# Create a DataLoader
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

model = FinetuneClassifier(head_dropout=.1)
model = nn.DataParallel(model)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-6, weight_decay=1.5e-3)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=500) 
logs_dict = {}

In [1]:
train(
  model, 
  train_dataloader, 
  valid_dataloader, 
  2000, 
  optimizer, 
  accelerator,
  blind_steps=100, 
  additional_metrics=[('precision', precision_score), ('recall', recall_score),('accuracy', accuracy_score)],
  losses_dict=logs_dict,
  scheduler=scheduler
)

# RoBERTa Model with K-Fold

In [None]:
class MyDataset(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, target='target', max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append(((tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device)), torch.tensor(row[target]).to(device)))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class MyDatasetTest(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append((torch.tensor(row['id']).to(device) ,(tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device))))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class FinetuneClassifier(nn.Module):
    def __init__(self, model=model_checkpoint, classes=2, head_dropout=0.2):
        super().__init__()
        
        self.model = RobertaModel.from_pretrained(model)
        hidden_size = self.model.config.hidden_size
        
        self.project = torch.nn.Sequential(
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, classes)
        )

    def forward(self, input_ids, attention_mask=None):
        res = self.model.forward(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        res = res[0]
        res = res[:,0,:] # encoding for <s> token
        res = self.project(res)
        return res
    
    def parameters_num(self):
        return sum(p.numel() for p in self.parameters())

In [None]:
def train(model, 
          train_dataloader, 
          valid_dataloader, 
          steps, 
          optimizer,
          accelerator,
          blind_steps=None,
          loss_fn=torch.nn.BCELoss(),
          main_metric=('f1', f1_score), 
          additional_metrics=[],
          filepath='model_best_RoBERTa.pt',
          load_best=True,
          scheduler=None,
          losses_dict=None):
    
    if blind_steps == None:
        blind_steps = len(train_dataloader) // 4
    
    def evaluate():  # the first score returned is the main
        model.eval()
        
        y_trues = []
        y_hats = []
        
        loss = 0
        k = 0
        
        with torch.no_grad():
            for batch in valid_dataloader:
                
                (ids, mask), y_true = batch
                ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
                y_true = accelerator.prepare(y_true)
                hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
                y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)

                loss += float(loss_fn(y_hat, hots))
                k += 1
                
                for i in range(y_true.shape[0]):
                    y_trues.append(int(y_true[i]))
                    y_hats.append(1 if y_hat[i][0] < y_hat[i][1] else 0)
        
        scores = [(main_metric[0], main_metric[1](y_trues, y_hats))]
        
        for metric in additional_metrics:
            scores.append((metric[0], metric[1](y_trues, y_hats)))        
        
        model.train()
        return scores + [('valid_loss', loss/k)]
    
    
    def render_scores(scores, step, best=None):
        print('{:05d} steps'.format(step), end=' ')
        
        for score in scores:
            print("| {}: {:.3f}".format(*score), end=' ')
            
        if best != None:
            print('| best_score: {:.3f}'.format(best))
            
    
    # initial scores
    scores = evaluate()
    render_scores(scores, 0)
    best_score = scores[0][1]
    torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
    
    # logs
    if losses_dict != None:
        losses_dict['train_loss'] = []
        losses_dict['valid_loss'] = []
        losses_dict[main_metric[0]] = []
    
    epoch_loss = 0
    k = 0
    
    train_iter = iter(train_dataloader)
    model.train()
    
    for step in tqdm(range(steps)):
        
        # retrieving a batch
        try:
            batch = next(train_iter)
        except:
            train_iter = iter(train_dataloader)
            batch = next(train_iter)

        (ids, mask), y_true = batch
        ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
        y_true = accelerator.prepare(y_true)

        # prediction
        y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)
        hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
        loss = loss_fn(y_hat, hots)
        
        # backprop
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        
        if scheduler != None:
            scheduler.step()
            
        epoch_loss += float(loss)
        k += 1
        
        # evaluation
        if (step + 1) % blind_steps == 0:
            scores = evaluate() + [('train_loss', epoch_loss/k)]
            
            if losses_dict != None:
                losses_dict['valid_loss'].append(float(scores[-2][1]))
                losses_dict['train_loss'].append(float(scores[-1][1]))
                losses_dict[main_metric[0]].append(float(scores[0][1]))
            
            if scores[0][1] > best_score:
                best_score = scores[0][1]
                torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
                
            render_scores(scores, step + 1, best=best_score)
            epoch_loss = 0
            k = 0
                
    if load_best:
        state_dict = torch.load(filepath)

        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if "module." not in k:
                name = 'module.' + k
                new_state_dict[name] = v

        model.load_state_dict(new_state_dict)

In [1]:
device = accelerator.device
model_checkpoint = "roberta-large"

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
model = RobertaModel.from_pretrained(model_checkpoint)

# Define the batch size and the number of folds
batch_size = 16
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds)

# Iterating over each fold
for fold, (train_index, valid_index) in enumerate(skf.split(filtered_df, filtered_df['target'])):

    print(f'FOLD {fold + 1}')
    
    # Split the data into train and validation datasets for the current fold
    train_dataset = MyDataset(filtered_df.iloc[train_index], 'final_text', tokenizer)
    valid_dataset = MyDataset(filtered_df.iloc[valid_index], 'final_text', tokenizer)
    
    # Create DataLoaders for the current fold
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize model for the current fold
    model = FinetuneClassifier(head_dropout=.1)
    model = nn.DataParallel(model)
    model.to(device)
    
    # Initialize optimizer for the current fold
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-6, weight_decay=1e-3)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=500) 
    logs_dict = {}
    
    # Train the model for the current fold
    train(
      model, 
      train_dataloader, 
      valid_dataloader, 
      1500, 
      optimizer, 
      accelerator,
      blind_steps=100, 
      additional_metrics=[('precision', precision_score), ('recall', recall_score),('accuracy', accuracy_score)],
      filepath=f'model_best_RoBERTa_fold_{fold}.pt',
      losses_dict=logs_dict,
      scheduler=scheduler
    )

dataset_test = MyDatasetTest(filtered_test, 'final_text', tokenizer)

for fold in range(n_folds):
    model_path = f'/kaggle/working/model_best_RoBERTa_fold_{fold}.pt'
    state_dict = torch.load(model_path)

    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if 'module.' not in k:
            name = 'module.' + k
            new_state_dict[name] = v

    model.load_state_dict(new_state_dict)
    
    predictions_df = pd.DataFrame()
    for i, (ids, mask) in tqdm(dataset_test):
        ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
        pred = model(input_ids=ids[None], attention_mask=mask[None])[0]
        y_hat = 1 if pred[0] < pred[1] else 0
        r = [int(i), y_hat]
        predictions_df = pd.concat([predictions_df, pd.DataFrame(np.array(r)[None,:], columns=['id', 'target'])])
    
    # Add the fold's predictions to the combined predictions DataFrame
    predictions_df.columns = ['id', f'target_fold_{fold}']
    if fold == 0:
        combined_predictions = predictions_df
    else:
        combined_predictions = combined_predictions.merge(predictions_df, on='id')

combined_predictions['target'] = combined_predictions.iloc[:, 1:].mean(axis=1).round().astype(int)
combined_predictions[['id', 'target']].to_csv('submission.csv', index=False)

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['clean_text'] = filtered_df['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['combined'] = filtered_df.apply(combine_columns, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['final_text'] =  filtered_df['clean_text']+' '+ filtered_d

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FOLD 1


100%|████████████████████████████| 5988/5988 [00:14<00:00, 417.96it/s]
100%|████████████████████████████| 1497/1497 [00:01<00:00, 787.64it/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


00000 steps | f1: 0.598 | precision: 0.426 | recall: 1.000 | accuracy: 0.426 | valid_loss: 0.782 

  7%|▋         | 100/1500 [03:23<6:26:13, 16.55s/it]

00100 steps | f1: 0.539 | precision: 0.914 | recall: 0.382 | accuracy: 0.721 | valid_loss: 0.593 | train_loss: 0.701 | best_score: 0.598


 13%|█▎        | 200/1500 [06:51<6:18:06, 17.45s/it]

00200 steps | f1: 0.781 | precision: 0.882 | recall: 0.701 | accuracy: 0.832 | valid_loss: 0.408 | train_loss: 0.491 | best_score: 0.781


 20%|██        | 300/1500 [10:20<5:47:06, 17.36s/it]

00300 steps | f1: 0.783 | precision: 0.851 | recall: 0.726 | accuracy: 0.829 | valid_loss: 0.385 | train_loss: 0.430 | best_score: 0.783


 27%|██▋       | 400/1500 [13:47<5:19:17, 17.42s/it]

00400 steps | f1: 0.804 | precision: 0.823 | recall: 0.785 | accuracy: 0.836 | valid_loss: 0.397 | train_loss: 0.443 | best_score: 0.804


 33%|███▎      | 500/1500 [17:12<4:35:53, 16.55s/it]

00500 steps | f1: 0.791 | precision: 0.887 | recall: 0.713 | accuracy: 0.839 | valid_loss: 0.386 | train_loss: 0.386 | best_score: 0.804


 40%|████      | 600/1500 [20:37<4:07:32, 16.50s/it]

00600 steps | f1: 0.794 | precision: 0.777 | recall: 0.810 | accuracy: 0.820 | valid_loss: 0.412 | train_loss: 0.391 | best_score: 0.804


 47%|████▋     | 700/1500 [24:01<3:40:12, 16.52s/it]

00700 steps | f1: 0.790 | precision: 0.794 | recall: 0.785 | accuracy: 0.822 | valid_loss: 0.430 | train_loss: 0.366 | best_score: 0.804


 53%|█████▎    | 800/1500 [27:25<3:12:42, 16.52s/it]

00800 steps | f1: 0.780 | precision: 0.901 | recall: 0.688 | accuracy: 0.835 | valid_loss: 0.380 | train_loss: 0.384 | best_score: 0.804


 60%|██████    | 900/1500 [30:50<2:44:50, 16.48s/it]

00900 steps | f1: 0.775 | precision: 0.900 | recall: 0.680 | accuracy: 0.832 | valid_loss: 0.401 | train_loss: 0.312 | best_score: 0.804


 67%|██████▋   | 1000/1500 [34:15<2:17:56, 16.55s/it]

01000 steps | f1: 0.788 | precision: 0.848 | recall: 0.735 | accuracy: 0.831 | valid_loss: 0.416 | train_loss: 0.305 | best_score: 0.804


 73%|███████▎  | 1100/1500 [37:39<1:50:24, 16.56s/it]

01100 steps | f1: 0.773 | precision: 0.888 | recall: 0.685 | accuracy: 0.829 | valid_loss: 0.399 | train_loss: 0.334 | best_score: 0.804


 80%|████████  | 1200/1500 [41:04<1:22:55, 16.59s/it]

01200 steps | f1: 0.785 | precision: 0.873 | recall: 0.713 | accuracy: 0.834 | valid_loss: 0.414 | train_loss: 0.271 | best_score: 0.804


 87%|████████▋ | 1300/1500 [44:28<55:06, 16.53s/it]  

01300 steps | f1: 0.777 | precision: 0.804 | recall: 0.752 | accuracy: 0.816 | valid_loss: 0.475 | train_loss: 0.255 | best_score: 0.804


 93%|█████████▎| 1400/1500 [47:52<27:32, 16.52s/it]

01400 steps | f1: 0.794 | precision: 0.833 | recall: 0.759 | accuracy: 0.832 | valid_loss: 0.435 | train_loss: 0.285 | best_score: 0.804


100%|██████████| 1500/1500 [51:16<00:00,  2.05s/it]

01500 steps | f1: 0.794 | precision: 0.850 | recall: 0.745 | accuracy: 0.835 | valid_loss: 0.418 | train_loss: 0.298 | best_score: 0.804





FOLD 2


100%|████████████████████████████| 5988/5988 [00:06<00:00, 867.99it/s]
100%|████████████████████████████| 1497/1497 [00:01<00:00, 938.40it/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  _warn_prf(average, modifier, msg_start, len(result))


00000 steps | f1: 0.000 | precision: 0.000 | recall: 0.000 | accuracy: 0.574 | valid_loss: 0.699 

  7%|▋         | 100/1500 [03:28<6:46:52, 17.44s/it]

00100 steps | f1: 0.689 | precision: 0.682 | recall: 0.696 | accuracy: 0.732 | valid_loss: 0.550 | train_loss: 0.668 | best_score: 0.689


 13%|█▎        | 200/1500 [06:56<6:15:40, 17.34s/it]

00200 steps | f1: 0.767 | precision: 0.790 | recall: 0.745 | accuracy: 0.807 | valid_loss: 0.464 | train_loss: 0.480 | best_score: 0.767


 20%|██        | 300/1500 [10:21<5:30:27, 16.52s/it]

00300 steps | f1: 0.752 | precision: 0.820 | recall: 0.694 | accuracy: 0.805 | valid_loss: 0.479 | train_loss: 0.399 | best_score: 0.767


 27%|██▋       | 400/1500 [13:45<5:02:54, 16.52s/it]

00400 steps | f1: 0.748 | precision: 0.827 | recall: 0.683 | accuracy: 0.804 | valid_loss: 0.469 | train_loss: 0.380 | best_score: 0.767


 33%|███▎      | 500/1500 [17:10<4:35:16, 16.52s/it]

00500 steps | f1: 0.738 | precision: 0.922 | recall: 0.614 | accuracy: 0.814 | valid_loss: 0.459 | train_loss: 0.379 | best_score: 0.767


 40%|████      | 600/1500 [20:35<4:08:00, 16.53s/it]

00600 steps | f1: 0.754 | precision: 0.727 | recall: 0.782 | accuracy: 0.782 | valid_loss: 0.555 | train_loss: 0.348 | best_score: 0.767


 47%|████▋     | 700/1500 [24:00<3:40:20, 16.53s/it]

00700 steps | f1: 0.753 | precision: 0.766 | recall: 0.740 | accuracy: 0.793 | valid_loss: 0.536 | train_loss: 0.347 | best_score: 0.767


 53%|█████▎    | 800/1500 [27:24<3:12:51, 16.53s/it]

00800 steps | f1: 0.745 | precision: 0.754 | recall: 0.737 | accuracy: 0.786 | valid_loss: 0.552 | train_loss: 0.314 | best_score: 0.767


 60%|██████    | 900/1500 [30:49<2:45:27, 16.55s/it]

00900 steps | f1: 0.735 | precision: 0.756 | recall: 0.715 | accuracy: 0.780 | valid_loss: 0.591 | train_loss: 0.281 | best_score: 0.767


 67%|██████▋   | 1000/1500 [34:14<2:17:44, 16.53s/it]

01000 steps | f1: 0.754 | precision: 0.839 | recall: 0.685 | accuracy: 0.810 | valid_loss: 0.483 | train_loss: 0.309 | best_score: 0.767


 73%|███████▎  | 1100/1500 [37:39<1:50:14, 16.54s/it]

01100 steps | f1: 0.742 | precision: 0.746 | recall: 0.738 | accuracy: 0.782 | valid_loss: 0.555 | train_loss: 0.288 | best_score: 0.767


 80%|████████  | 1200/1500 [41:03<1:23:16, 16.66s/it]

01200 steps | f1: 0.740 | precision: 0.762 | recall: 0.719 | accuracy: 0.785 | valid_loss: 0.600 | train_loss: 0.259 | best_score: 0.767


 87%|████████▋ | 1300/1500 [44:28<55:13, 16.57s/it]  

01300 steps | f1: 0.730 | precision: 0.782 | recall: 0.685 | accuracy: 0.784 | valid_loss: 0.637 | train_loss: 0.222 | best_score: 0.767


 93%|█████████▎| 1400/1500 [47:52<27:36, 16.57s/it]

01400 steps | f1: 0.718 | precision: 0.774 | recall: 0.669 | accuracy: 0.776 | valid_loss: 0.674 | train_loss: 0.237 | best_score: 0.767


100%|██████████| 1500/1500 [51:16<00:00,  2.05s/it]

01500 steps | f1: 0.719 | precision: 0.698 | recall: 0.741 | accuracy: 0.753 | valid_loss: 0.671 | train_loss: 0.249 | best_score: 0.767





FOLD 3


100%|████████████████████████████| 5988/5988 [00:06<00:00, 925.69it/s]
100%|████████████████████████████| 1497/1497 [00:01<00:00, 924.53it/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


00000 steps | f1: 0.275 | precision: 0.480 | recall: 0.193 | accuracy: 0.567 | valid_loss: 0.690 

  7%|▋         | 100/1500 [03:30<7:03:06, 18.13s/it]

00100 steps | f1: 0.631 | precision: 0.754 | recall: 0.542 | accuracy: 0.729 | valid_loss: 0.544 | train_loss: 0.659 | best_score: 0.631


 13%|█▎        | 200/1500 [06:57<6:17:48, 17.44s/it]

00200 steps | f1: 0.763 | precision: 0.816 | recall: 0.716 | accuracy: 0.810 | valid_loss: 0.459 | train_loss: 0.488 | best_score: 0.763


 20%|██        | 300/1500 [10:26<5:50:15, 17.51s/it]

00300 steps | f1: 0.772 | precision: 0.832 | recall: 0.721 | accuracy: 0.819 | valid_loss: 0.451 | train_loss: 0.414 | best_score: 0.772


 27%|██▋       | 400/1500 [13:56<5:34:22, 18.24s/it]

00400 steps | f1: 0.776 | precision: 0.785 | recall: 0.766 | accuracy: 0.811 | valid_loss: 0.441 | train_loss: 0.380 | best_score: 0.776


 33%|███▎      | 500/1500 [17:23<4:48:44, 17.32s/it]

00500 steps | f1: 0.782 | precision: 0.848 | recall: 0.726 | accuracy: 0.828 | valid_loss: 0.425 | train_loss: 0.363 | best_score: 0.782


 40%|████      | 600/1500 [20:48<4:07:47, 16.52s/it]

00600 steps | f1: 0.776 | precision: 0.787 | recall: 0.765 | accuracy: 0.812 | valid_loss: 0.477 | train_loss: 0.357 | best_score: 0.782


 47%|████▋     | 700/1500 [24:13<3:40:18, 16.52s/it]

00700 steps | f1: 0.782 | precision: 0.841 | recall: 0.730 | accuracy: 0.826 | valid_loss: 0.424 | train_loss: 0.352 | best_score: 0.782


 53%|█████▎    | 800/1500 [27:37<3:12:34, 16.51s/it]

00800 steps | f1: 0.773 | precision: 0.759 | recall: 0.787 | accuracy: 0.803 | valid_loss: 0.495 | train_loss: 0.325 | best_score: 0.782


 60%|██████    | 900/1500 [31:02<2:45:05, 16.51s/it]

00900 steps | f1: 0.768 | precision: 0.789 | recall: 0.749 | accuracy: 0.808 | valid_loss: 0.509 | train_loss: 0.315 | best_score: 0.782


 67%|██████▋   | 1000/1500 [34:27<2:17:44, 16.53s/it]

01000 steps | f1: 0.759 | precision: 0.719 | recall: 0.803 | accuracy: 0.782 | valid_loss: 0.598 | train_loss: 0.296 | best_score: 0.782


 73%|███████▎  | 1100/1500 [37:52<1:50:12, 16.53s/it]

01100 steps | f1: 0.762 | precision: 0.708 | recall: 0.824 | accuracy: 0.780 | valid_loss: 0.554 | train_loss: 0.319 | best_score: 0.782


 80%|████████  | 1200/1500 [41:16<1:22:35, 16.52s/it]

01200 steps | f1: 0.749 | precision: 0.676 | recall: 0.839 | accuracy: 0.760 | valid_loss: 0.679 | train_loss: 0.261 | best_score: 0.782


 87%|████████▋ | 1300/1500 [44:41<55:04, 16.52s/it]  

01300 steps | f1: 0.766 | precision: 0.748 | recall: 0.785 | accuracy: 0.796 | valid_loss: 0.610 | train_loss: 0.249 | best_score: 0.782


 93%|█████████▎| 1400/1500 [48:06<27:48, 16.69s/it]

01400 steps | f1: 0.762 | precision: 0.732 | recall: 0.795 | accuracy: 0.788 | valid_loss: 0.604 | train_loss: 0.242 | best_score: 0.782


100%|██████████| 1500/1500 [51:30<00:00,  2.06s/it]

01500 steps | f1: 0.736 | precision: 0.674 | recall: 0.812 | accuracy: 0.752 | valid_loss: 0.744 | train_loss: 0.248 | best_score: 0.782





FOLD 4


100%|████████████████████████████| 5988/5988 [00:06<00:00, 935.23it/s]
100%|████████████████████████████| 1497/1497 [00:01<00:00, 956.38it/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


00000 steps | f1: 0.598 | precision: 0.432 | recall: 0.972 | accuracy: 0.444 | valid_loss: 0.696 

  7%|▋         | 100/1500 [03:35<7:38:40, 19.66s/it]

00100 steps | f1: 0.627 | precision: 0.856 | recall: 0.495 | accuracy: 0.749 | valid_loss: 0.606 | train_loss: 0.675 | best_score: 0.627


 13%|█▎        | 200/1500 [07:10<7:04:56, 19.61s/it]

00200 steps | f1: 0.771 | precision: 0.720 | recall: 0.830 | accuracy: 0.790 | valid_loss: 0.487 | train_loss: 0.505 | best_score: 0.771


 20%|██        | 300/1500 [10:35<5:30:23, 16.52s/it]

00300 steps | f1: 0.768 | precision: 0.796 | recall: 0.743 | accuracy: 0.810 | valid_loss: 0.459 | train_loss: 0.438 | best_score: 0.771


 27%|██▋       | 400/1500 [14:09<5:59:10, 19.59s/it]

00400 steps | f1: 0.772 | precision: 0.795 | recall: 0.750 | accuracy: 0.812 | valid_loss: 0.425 | train_loss: 0.404 | best_score: 0.772


 33%|███▎      | 500/1500 [17:38<4:51:55, 17.52s/it]

00500 steps | f1: 0.780 | precision: 0.795 | recall: 0.766 | accuracy: 0.816 | valid_loss: 0.414 | train_loss: 0.382 | best_score: 0.780


 40%|████      | 600/1500 [21:03<4:07:05, 16.47s/it]

00600 steps | f1: 0.779 | precision: 0.723 | recall: 0.846 | accuracy: 0.796 | valid_loss: 0.490 | train_loss: 0.380 | best_score: 0.780


 47%|████▋     | 700/1500 [24:28<3:39:34, 16.47s/it]

00700 steps | f1: 0.777 | precision: 0.800 | recall: 0.755 | accuracy: 0.816 | valid_loss: 0.421 | train_loss: 0.368 | best_score: 0.780


 53%|█████▎    | 800/1500 [28:01<3:47:14, 19.48s/it]

00800 steps | f1: 0.781 | precision: 0.769 | recall: 0.793 | accuracy: 0.810 | valid_loss: 0.445 | train_loss: 0.312 | best_score: 0.781


 60%|██████    | 900/1500 [31:37<3:14:44, 19.47s/it]

00900 steps | f1: 0.782 | precision: 0.761 | recall: 0.804 | accuracy: 0.809 | valid_loss: 0.439 | train_loss: 0.343 | best_score: 0.782


 67%|██████▋   | 1000/1500 [35:02<2:17:32, 16.50s/it]

01000 steps | f1: 0.766 | precision: 0.768 | recall: 0.765 | accuracy: 0.802 | valid_loss: 0.459 | train_loss: 0.335 | best_score: 0.782


 73%|███████▎  | 1100/1500 [38:27<1:50:12, 16.53s/it]

01100 steps | f1: 0.762 | precision: 0.891 | recall: 0.666 | accuracy: 0.823 | valid_loss: 0.454 | train_loss: 0.309 | best_score: 0.782


 80%|████████  | 1200/1500 [41:51<1:22:40, 16.54s/it]

01200 steps | f1: 0.758 | precision: 0.714 | recall: 0.807 | accuracy: 0.780 | valid_loss: 0.498 | train_loss: 0.303 | best_score: 0.782


 87%|████████▋ | 1300/1500 [45:16<55:06, 16.53s/it]  

01300 steps | f1: 0.765 | precision: 0.768 | recall: 0.763 | accuracy: 0.801 | valid_loss: 0.487 | train_loss: 0.271 | best_score: 0.782


 93%|█████████▎| 1400/1500 [48:41<27:33, 16.54s/it]

01400 steps | f1: 0.757 | precision: 0.831 | recall: 0.695 | accuracy: 0.810 | valid_loss: 0.515 | train_loss: 0.252 | best_score: 0.782


100%|██████████| 1500/1500 [52:06<00:00,  2.08s/it]

01500 steps | f1: 0.760 | precision: 0.761 | recall: 0.760 | accuracy: 0.796 | valid_loss: 0.485 | train_loss: 0.274 | best_score: 0.782





FOLD 5


100%|████████████████████████████| 5988/5988 [00:06<00:00, 903.82it/s]
100%|████████████████████████████| 1497/1497 [00:01<00:00, 924.17it/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


00000 steps | f1: 0.597 | precision: 0.426 | recall: 1.000 | accuracy: 0.426 | valid_loss: 0.708 

  7%|▋         | 100/1500 [03:35<7:37:14, 19.60s/it]

00100 steps | f1: 0.740 | precision: 0.843 | recall: 0.659 | accuracy: 0.803 | valid_loss: 0.515 | train_loss: 0.650 | best_score: 0.740


 13%|█▎        | 200/1500 [07:10<7:02:49, 19.51s/it]

00200 steps | f1: 0.812 | precision: 0.894 | recall: 0.744 | accuracy: 0.854 | valid_loss: 0.398 | train_loss: 0.483 | best_score: 0.812


 20%|██        | 300/1500 [10:44<6:29:21, 19.47s/it]

00300 steps | f1: 0.815 | precision: 0.770 | recall: 0.867 | accuracy: 0.833 | valid_loss: 0.435 | train_loss: 0.430 | best_score: 0.815


 27%|██▋       | 400/1500 [14:12<5:22:39, 17.60s/it]

00400 steps | f1: 0.822 | precision: 0.816 | recall: 0.829 | accuracy: 0.848 | valid_loss: 0.375 | train_loss: 0.424 | best_score: 0.822


 33%|███▎      | 500/1500 [17:37<4:35:30, 16.53s/it]

00500 steps | f1: 0.799 | precision: 0.890 | recall: 0.725 | accuracy: 0.845 | valid_loss: 0.371 | train_loss: 0.386 | best_score: 0.822


 40%|████      | 600/1500 [21:02<4:08:07, 16.54s/it]

00600 steps | f1: 0.798 | precision: 0.728 | recall: 0.882 | accuracy: 0.810 | valid_loss: 0.445 | train_loss: 0.378 | best_score: 0.822


 47%|████▋     | 700/1500 [24:27<3:41:09, 16.59s/it]

00700 steps | f1: 0.821 | precision: 0.854 | recall: 0.790 | accuracy: 0.853 | valid_loss: 0.363 | train_loss: 0.401 | best_score: 0.822


 53%|█████▎    | 800/1500 [28:02<3:48:45, 19.61s/it]

00800 steps | f1: 0.828 | precision: 0.852 | recall: 0.805 | accuracy: 0.858 | valid_loss: 0.357 | train_loss: 0.368 | best_score: 0.828


 60%|██████    | 900/1500 [31:27<2:45:28, 16.55s/it]

00900 steps | f1: 0.821 | precision: 0.857 | recall: 0.788 | accuracy: 0.854 | valid_loss: 0.369 | train_loss: 0.332 | best_score: 0.828


 67%|██████▋   | 1000/1500 [34:55<2:25:34, 17.47s/it]

01000 steps | f1: 0.830 | precision: 0.860 | recall: 0.802 | accuracy: 0.860 | valid_loss: 0.370 | train_loss: 0.317 | best_score: 0.830


 73%|███████▎  | 1100/1500 [38:20<1:49:39, 16.45s/it]

01100 steps | f1: 0.817 | precision: 0.784 | recall: 0.854 | accuracy: 0.838 | valid_loss: 0.404 | train_loss: 0.360 | best_score: 0.830


 80%|████████  | 1200/1500 [41:44<1:22:34, 16.52s/it]

01200 steps | f1: 0.795 | precision: 0.715 | recall: 0.896 | accuracy: 0.804 | valid_loss: 0.563 | train_loss: 0.280 | best_score: 0.830


 87%|████████▋ | 1300/1500 [45:09<55:02, 16.51s/it]  

01300 steps | f1: 0.817 | precision: 0.811 | recall: 0.823 | accuracy: 0.843 | valid_loss: 0.403 | train_loss: 0.311 | best_score: 0.830


 93%|█████████▎| 1400/1500 [48:35<27:36, 16.56s/it]

01400 steps | f1: 0.814 | precision: 0.821 | recall: 0.807 | accuracy: 0.843 | valid_loss: 0.406 | train_loss: 0.306 | best_score: 0.830


100%|██████████| 1500/1500 [51:59<00:00,  2.08s/it]

01500 steps | f1: 0.805 | precision: 0.747 | recall: 0.873 | accuracy: 0.820 | valid_loss: 0.504 | train_loss: 0.294 | best_score: 0.830



100%|███████████████████████████| 3263/3263 [00:03<00:00, 1068.31it/s]
100%|██████████| 3263/3263 [04:26<00:00, 12.23it/s]
100%|██████████| 3263/3263 [04:27<00:00, 12.20it/s]
100%|██████████| 3263/3263 [04:31<00:00, 12.01it/s]
100%|██████████| 3263/3263 [04:31<00:00, 12.01it/s]
100%|██████████| 3263/3263 [04:31<00:00, 12.03it/s]


# XLNet Model

In [None]:
class MyDataset(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, target='target', max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append(((tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device)), torch.tensor(row[target]).to(device)))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
    def train_valid_split(self, train_fraction=.8, shuffle=True):
        num_train_examples = int(len(self) * train_fraction)
        train_dataset = copy.deepcopy(self)
        
        if shuffle:
            random.shuffle(train_dataset.data)
        
        valid_dataset = copy.deepcopy(train_dataset)
        train_dataset.data = train_dataset.data[:num_train_examples]
        valid_dataset.data = valid_dataset.data[num_train_examples:]
        
        return train_dataset, valid_dataset

In [None]:
class MyDatasetTest(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append((torch.tensor(row['id']).to(device) ,(tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device))))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class FinetuneClassifier(nn.Module):
    def __init__(self, model=model_checkpoint, classes=2, head_dropout=0.2):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(model)
        hidden_size = self.model.config.hidden_size
        
        self.project = torch.nn.Sequential(
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, classes) # projection
        )

    def forward(self, input_ids, attention_mask=None):
        res = self.model.forward(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        res = res[0]
        res = res[:,0,:] # encoding for <s> token
        res = self.project(res)
        return res
    
    def parameters_num(self):
        return sum(p.numel() for p in self.parameters())

In [None]:
def train(model, 
          train_dataloader, 
          valid_dataloader, 
          steps, 
          optimizer,
          accelerator,
          blind_steps=None,
          loss_fn=torch.nn.BCELoss(),
          main_metric=('f1', f1_score), 
          additional_metrics=[],
          filepath='model_best_XLNet.pt',
          load_best=True,
          scheduler=None,
          losses_dict=None):
    
    if blind_steps == None:
        blind_steps = len(train_dataloader) // 4
    
    def evaluate():  # the first score returned is the main
        model.eval()
        
        y_trues = []
        y_hats = []
        
        loss = 0
        k = 0
        
        with torch.no_grad():
            for batch in valid_dataloader:
                
                (ids, mask), y_true = batch
                ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
                y_true = accelerator.prepare(y_true)
                hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
                y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)

                loss += float(loss_fn(y_hat, hots))
                k += 1
                
                for i in range(y_true.shape[0]):
                    y_trues.append(int(y_true[i]))
                    y_hats.append(1 if y_hat[i][0] < y_hat[i][1] else 0)
        
        scores = [(main_metric[0], main_metric[1](y_trues, y_hats))]
        
        for metric in additional_metrics:
            scores.append((metric[0], metric[1](y_trues, y_hats)))        
        
        model.train()
        return scores + [('valid_loss', loss/k)]
    
    
    def render_scores(scores, step, best=None):
        print('{:05d} steps'.format(step), end=' ')
        
        for score in scores:
            print("| {}: {:.3f}".format(*score), end=' ')
            
        if best != None:
            print('| best_score: {:.3f}'.format(best))
            
    
    # initial scores
    scores = evaluate()
    render_scores(scores, 0)
    best_score = scores[0][1]
    torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
    
    # logs
    if losses_dict != None:
        losses_dict['train_loss'] = []
        losses_dict['valid_loss'] = []
        losses_dict[main_metric[0]] = []
    
    epoch_loss = 0
    k = 0
    
    train_iter = iter(train_dataloader)
    model.train()
    
    for step in tqdm(range(steps)):
        
        # retrieving a batch
        try:
            batch = next(train_iter)
        except:
            train_iter = iter(train_dataloader)
            batch = next(train_iter)

        (ids, mask), y_true = batch
        ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
        y_true = accelerator.prepare(y_true)

        # prediction
        y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)
        hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
        loss = loss_fn(y_hat, hots)
        
        # backprop
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        
        if scheduler != None:
            scheduler.step()
            
        epoch_loss += float(loss)
        k += 1
        
        # evaluation
        if (step + 1) % blind_steps == 0:
            scores = evaluate() + [('train_loss', epoch_loss/k)]
            
            if losses_dict != None:
                losses_dict['valid_loss'].append(float(scores[-2][1]))
                losses_dict['train_loss'].append(float(scores[-1][1]))
                losses_dict[main_metric[0]].append(float(scores[0][1]))
            
            if scores[0][1] > best_score:
                best_score = scores[0][1]
                torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
                
            render_scores(scores, step + 1, best=best_score)
            epoch_loss = 0
            k = 0
                
    if load_best:
        state_dict = torch.load(filepath)

        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if "module." not in k:
                name = 'module.' + k
                new_state_dict[name] = v

        model.load_state_dict(new_state_dict)

In [2]:
device = accelerator.device
model_checkpoint = "xlnet-large-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create an instance of your custom Dataset class
dataset = MyDataset(filtered_df, 'final_text', tokenizer)
train_dataset, valid_dataset = dataset.train_valid_split()

dataset_test = MyDatasetTest(filtered_test, 'final_text', tokenizer)

# Create a DataLoader
batch_size = 12
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

model = FinetuneClassifier(head_dropout=.1)
model = nn.DataParallel(model)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=500) 
logs_dict = {}

In [2]:
train(
  model, 
  train_dataloader, 
  valid_dataloader, 
  2000, 
  optimizer, 
  accelerator,
  blind_steps=100, 
  additional_metrics=[('precision', precision_score), ('recall', recall_score),('accuracy', accuracy_score)],
  losses_dict=logs_dict,
  scheduler=scheduler
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

100%|████████████████████████████| 7485/7485 [00:17<00:00, 418.90it/s]
100%|████████████████████████████| 3263/3263 [00:03<00:00, 967.56it/s]


Downloading pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


00000 steps | f1: 0.397 | precision: 0.437 | recall: 0.363 | accuracy: 0.524 | valid_loss: 0.872 

100%|██████████| 100/100 [03:58<00:00,  2.38s/it]

00100 steps | f1: 0.312 | precision: 0.608 | recall: 0.210 | accuracy: 0.602 | valid_loss: 0.651 | train_loss: 0.870 | best_score: 0.397





# ELECTRA Model

In [None]:
class MyDataset(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, target='target', max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append(((tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device)), torch.tensor(row[target]).to(device)))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
    def train_valid_split(self, train_fraction=.8, shuffle=True):
        num_train_examples = int(len(self) * train_fraction)
        train_dataset = copy.deepcopy(self)
        
        if shuffle:
            random.shuffle(train_dataset.data)
        
        valid_dataset = copy.deepcopy(train_dataset)
        train_dataset.data = train_dataset.data[:num_train_examples]
        valid_dataset.data = valid_dataset.data[num_train_examples:]
        
        return train_dataset, valid_dataset

In [None]:
class MyDatasetTest(Dataset):
    def __init__(self, dataframe, text_column, tokenizer, max_length = 256):
        self.data = []
        
        for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), ncols=70):
            text = row[text_column]
            tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
            self.data.append((torch.tensor(row['id']).to(device) ,(tokenized['input_ids'][0].to(device), tokenized['attention_mask'][0].to(device))))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class FinetuneClassifier(nn.Module):
    def __init__(self, model=model_checkpoint, classes=2, head_dropout=0.2):
        super().__init__()

        self.model = ElectraModel.from_pretrained(model)
        hidden_size = self.model.config.hidden_size

        self.project = torch.nn.Sequential(
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.Dropout(head_dropout),
            torch.nn.Linear(hidden_size, classes) 
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs[0]
        cls_output = last_hidden_state[:, 0, :]
        cls_output = self.project(cls_output)
        return cls_output

    def parameters_num(self):
        return sum(p.numel() for p in self.parameters())

In [None]:
def train(model, 
          train_dataloader, 
          valid_dataloader, 
          steps, 
          optimizer,
          accelerator,
          blind_steps=None,
          loss_fn=torch.nn.BCELoss(),
          main_metric=('f1', f1_score), 
          additional_metrics=[],
          filepath='model_best_ELECTRA.pt',
          load_best=True,
          scheduler=None,
          losses_dict=None):
    
    if blind_steps == None:
        blind_steps = len(train_dataloader) // 4
    
    def evaluate():  # the first score returned is the main
        model.eval()
        
        y_trues = []
        y_hats = []
        
        loss = 0
        k = 0
        
        with torch.no_grad():
            for batch in valid_dataloader:
                
                (ids, mask), y_true = batch
                ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
                y_true = accelerator.prepare(y_true)
                hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
                y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)

                loss += float(loss_fn(y_hat, hots))
                k += 1
                
                for i in range(y_true.shape[0]):
                    y_trues.append(int(y_true[i]))
                    y_hats.append(1 if y_hat[i][0] < y_hat[i][1] else 0)
        
        scores = [(main_metric[0], main_metric[1](y_trues, y_hats))]
        
        for metric in additional_metrics:
            scores.append((metric[0], metric[1](y_trues, y_hats)))        
        
        model.train()
        return scores + [('valid_loss', loss/k)]
    
    
    def render_scores(scores, step, best=None):
        print('{:05d} steps'.format(step), end=' ')
        
        for score in scores:
            print("| {}: {:.3f}".format(*score), end=' ')
            
        if best != None:
            print('| best_score: {:.3f}'.format(best))
            
    
    # initial scores
    scores = evaluate()
    render_scores(scores, 0)
    best_score = scores[0][1]
    torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
    
    # logs
    if losses_dict != None:
        losses_dict['train_loss'] = []
        losses_dict['valid_loss'] = []
        losses_dict[main_metric[0]] = []
    
    epoch_loss = 0
    k = 0
    
    train_iter = iter(train_dataloader)
    model.train()
    
    for step in tqdm(range(steps)):
        
        # retrieving a batch
        try:
            batch = next(train_iter)
        except:
            train_iter = iter(train_dataloader)
            batch = next(train_iter)

        (ids, mask), y_true = batch
        ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
        y_true = accelerator.prepare(y_true)

        # prediction
        y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)
        hots = torch.nn.functional.one_hot(y_true, 2).to(dtype=torch.float)
        loss = loss_fn(y_hat, hots)
        
        # backprop
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        
        if scheduler != None:
            scheduler.step()
            
        epoch_loss += float(loss)
        k += 1
        
        # evaluation
        if (step + 1) % blind_steps == 0:
            scores = evaluate() + [('train_loss', epoch_loss/k)]
            
            if losses_dict != None:
                losses_dict['valid_loss'].append(float(scores[-2][1]))
                losses_dict['train_loss'].append(float(scores[-1][1]))
                losses_dict[main_metric[0]].append(float(scores[0][1]))
            
            if scores[0][1] > best_score:
                best_score = scores[0][1]
                torch.save(accelerator.unwrap_model(model).state_dict(), filepath)
                
            render_scores(scores, step + 1, best=best_score)
            epoch_loss = 0
            k = 0
                
    if load_best:
        state_dict = torch.load(filepath)

        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if "module." not in k:
                name = 'module.' + k
                new_state_dict[name] = v

        model.load_state_dict(new_state_dict)

In [None]:
device = accelerator.device
model_checkpoint = 'google/electra-base-discriminator'

tokenizer = ElectraTokenizer.from_pretrained(model_checkpoint)
model = ElectraForSequenceClassification.from_pretrained(model_checkpoint)

# Create an instance of your custom Dataset class
dataset = MyDataset(filtered_df, 'final_text', tokenizer)
train_dataset, valid_dataset = dataset.train_valid_split()

dataset_test = MyDatasetTest(filtered_test, 'final_text', tokenizer)

# Create a DataLoader
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

model = FinetuneClassifier(head_dropout=.1)
model = nn.DataParallel(model)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=500) 
logs_dict = {}

In [None]:
train(
  model, 
  train_dataloader, 
  valid_dataloader, 
  2000, 
  optimizer, 
  accelerator,
  blind_steps=100, 
  additional_metrics=[('precision', precision_score), ('recall', recall_score),('accuracy', accuracy_score)],
  losses_dict=logs_dict,
  scheduler=scheduler
)

# Predicting

In [None]:
model_name = 'BERT'

In [None]:
state_dict = torch.load(f'/kaggle/working/model_best_{model_name}.pt')

new_state_dict = OrderedDict()
for k, v in state_dict.items():
    if 'module.' not in k:
        name = 'module.' + k
        new_state_dict[name] = v
    
model.load_state_dict(new_state_dict)

def evaluate(model, valid_dataloader, metrics=[('f1', f1_score),('precision', precision_score), ('recall', recall_score),('accuracy', accuracy_score)]):
    model.eval()

    y_trues = []
    y_hats = []

    with torch.no_grad():
        for batch in valid_dataloader:

            (ids, mask), y_true = batch
            y_hat = torch.softmax(model.forward(input_ids=ids, attention_mask=mask),dim=-1)

            for i in range(y_true.shape[0]):
                y_trues.append(int(y_true[i]))
                y_hats.append(1 if y_hat[i][0] < y_hat[i][1] else 0)

    scores = []

    for metric in metrics:
        scores.append((metric[0], metric[1](y_trues, y_hats)))        
 
    return scores

scores = evaluate(model, valid_dataloader)
print(scores)

predictions_df = pd.DataFrame()
for i, (ids, mask) in tqdm(dataset_test):
    ids, mask = accelerator.prepare(ids), accelerator.prepare(mask)
    pred = model(input_ids=ids[None], attention_mask=mask[None])[0]
    y_hat = 1 if pred[0] < pred[1] else 0
    r = [int(i), y_hat]
    predictions_df = pd.concat([predictions_df, pd.DataFrame(np.array(r)[None,:], columns=['id', 'target'])])

predictions_df.to_csv('submission.csv', index=False)