In [1]:
# !pip3 install transformers

In [2]:
import pandas as pd 
import numpy as np
from transformers import BertModel, RobertaModel, AutoTokenizer, AutoConfig, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import os
import random
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

from torch.optim import AdamW
import warnings
warnings.filterwarnings(action='ignore')

from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import KFold, StratifiedKFold
import gc

In [3]:
platform = 'Kaggle'
train_type = 'val'
model_2_train = 'roberta-large'

model_suffix = 'Roberta_large_model.bin'

if platform=='Kaggle':
    train_path = "../input/commonlitmodels/train_5_fold_CV.csv"
    test_path = "../input/commonlitreadabilityprize/test.csv"
    bert_path = '../input/huggingface-bert/bert-base-uncased'
    roberta_base_path = '../input/huggingface-roberta/roberta-base'
    roberta_large_path = '../input/huggingface-roberta/roberta-large'
    model_dir = './'
else:
    train_path = '../data/train_5_fold_CV.csv'
    test_path =  '../data/test.csv'
    roberta_base_path = 'roberta-base'
    roberta_large_path = 'roberta-large'
    model_dir = "../models/"
    
    
HIDDEN_SIZE = 1024
config = {'train_path':train_path, 
          'test_path':test_path, 
          'device': 'cuda' if torch.cuda.is_available() else 'cpu',
          'tokenizer': AutoTokenizer.from_pretrained(roberta_large_path, lower=True), 
          'model_dir': model_dir,
          'model_suffix':model_suffix
         }

In [4]:
def read_dataset(path):
    
    temp = pd.read_csv(path)
    return temp
    

In [5]:
train_df = read_dataset(path=config['train_path'])
test_df = read_dataset(path=config['test_path'])
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,tb_fold,abt_fold
0,4f53dd071,https://simple.wikipedia.org/wiki/Astronomy,CC BY-SA 3.0 and GFDL,Astronomy is a natural science. It is the stud...,-0.123139,0.537258,4,1
1,29cd28197,https://en.wikipedia.org/wiki/Light-year,CC BY-SA 3.0,The light-year is a unit of length used to exp...,-3.256312,0.581264,4,1
2,924cdefd9,,,"He flung his pick out of the trench, climbed o...",-1.100205,0.487602,3,1
3,8d406094c,,,I am sorry to hear that Bulgaria demands conce...,-1.565674,0.504832,3,1
4,3c1674b21,https://kids.frontiersin.org/article/10.3389/f...,CC BY 4.0,"To practice their knowledge and skills, doctor...",-0.878021,0.446454,2,1
...,...,...,...,...,...,...,...,...
2829,1c9fe933e,,,Between his apprehension and his trial no frie...,-1.999922,0.463280,2,5
2830,6b9d9517c,,,Among the most subtle and skillful of all the ...,-1.877338,0.452149,1,5
2831,ebac871b7,https://simple.wikipedia.org/wiki/Lithosphere,CC BY-SA 3.0 and GFDL,There are two types of lithosphere:\nOceanic l...,-2.366802,0.498877,2,5
2832,84fba32e2,https://en.wikipedia.org/wiki/Moon,CC BY-SA 3.0,The Moon is Earth's only permanent natural sat...,-0.518585,0.494256,4,5


In [6]:
# train_df = train_df.sample(200).reset_index(drop=True)

In [8]:
class MeanPoolingModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        config = AutoConfig.from_pretrained(roberta_large_path)
        self.model = AutoModel.from_pretrained(roberta_large_path, config=config)
        self.linear = nn.Linear(HIDDEN_SIZE, 1)
        self.loss = nn.MSELoss()
        
    def forward(self, ids, mask):
        
        outputs = self.model(ids, mask)
        last_hidden_state = outputs[0]
        input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        logits = self.linear(mean_embeddings)
        
        preds = logits.squeeze(-1).squeeze(-1)
        
        return preds
    
class ConcatenateLastFourModel(nn.Module):
    
    def __init__(self, ):
        super().__init__()
        
        config = AutoConfig.from_pretrained(roberta_large_path)
        config.update({'output_hidden_states':True})
        self.model = AutoModel.from_pretrained(roberta_large_path, config=config)
        self.linear = nn.Linear(4*HIDDEN_SIZE, 1)
        self.loss = nn.MSELoss()
        
    def forward(self, ids, mask):
        
        outputs = self.model(ids, mask)
        all_hidden_states = torch.stack(outputs[2])
        concatenate_pooling = torch.cat(
            (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]), -1
        )
        concatenate_pooling = concatenate_pooling[:,0]
        logits = self.linear(concatenate_pooling)
        
        preds = logits.squeeze(-1).squeeze(-1)
        
        return preds

In [9]:
class form_input():
    def __init__(self, row_ID, sentence, target, data_type):
        self.row_ID = row_ID
        self.sentence=sentence
        self.target = target
        self.data_type=data_type
    
    def __len__(self):
        return len(self.sentence)
    
    def __getitem__(self, item):
        toks = config['tokenizer'].encode_plus(self.sentence[item])
        
        ids = toks['input_ids']
        att_mask = toks['attention_mask']
        label = self.target[item]
        
        pad_len = params_dict['Max_length'] - len(ids)
        
        # Padding.
        if len(ids)>params_dict['Max_length']:
            ids = ids[:params_dict['Max_length']]
            att_mask = att_mask[:params_dict['Max_length']]
        else:
            ids = ids + [2]*pad_len
            att_mask = att_mask + [0]*pad_len
            
        ################################################
        if ((self.data_type=='train') | (self.data_type=='valid')):
            target = self.target[item]
        else:
            target = 1
                    
        return {#'row_ID': torch.tensor(self.row_ID[item], dtype=torch.long), 
#                 'sentence': torch.tensor(self.sentence[item], dtype=torch.long), 
                'ids': torch.tensor(ids, dtype=torch.long), 
                'att_mask': torch.tensor(att_mask, dtype=torch.long), 
                'target': torch.tensor(target, dtype=torch.float) }
        

In [10]:
def get_input(df, data_type):
    temp = form_input(row_ID=df.id,
                      sentence=df.excerpt, 
                      target=df.target, 
                      data_type=data_type)
    
    return temp
    

def get_data_loader(class_input, batch_size, shuffle):
    temp_data_loader = DataLoader(class_input, 
                                  batch_size=batch_size,
                                  shuffle=shuffle
                                 )
    
    return temp_data_loader

In [11]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


In [12]:
criterion = nn.MSELoss()

In [13]:
def create_optimizer(model):
    named_parameters = list(model.named_parameters())    
    
    roberta_parameters = named_parameters[:388]    
    attention_parameters = named_parameters[391:395]
    regressor_parameters = named_parameters[395:]
        
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]

    parameters = []
    parameters.append({"params": attention_group})
    parameters.append({"params": regressor_group})

    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if "bias" in name else 0.01

        lr = 2e-5

        if layer_num >= 69:        
            lr = 2e-5

        if layer_num >= 133:
            lr = 2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

In [14]:
def train_fn(train_data_loader
             , valid_data_loader
             , model
             , optimizer
             , fold_no
             , epoch
             , best_score
             , best_epoch
             , best_step 
             , scheduler=None):
    
    model.train()
    train_loss = 0
    eval_at_every = params_dict['eval_every_step']
    
    gc.collect()
    torch.cuda.empty_cache()

    for current_step, data in enumerate(train_data_loader):
        
        optimizer.zero_grad()
        
        batch_ids = data['ids'].to(config['device'], dtype = torch.long)
        batch_att_mask = data['att_mask'].to(config['device'], dtype = torch.long)
        batch_target = data['target'].to(config['device'], dtype = torch.float)
                
        output = model(ids=batch_ids, 
                       mask=batch_att_mask)
        batch_prediction = output.flatten()
        
        batch_train_loss = criterion(batch_prediction, batch_target)
        train_loss+=batch_train_loss.sum()
        
        batch_train_loss.sum().backward()
        optimizer.step()
        
        scheduler.step()
            
        # Deleting the intermediate variables
        del output, batch_prediction, batch_train_loss
        torch.cuda.empty_cache()
        gc.collect()
        
        if current_step%eval_at_every == 0:
            # Calculate train loss for each step
            interim_avg_train_loss = torch.sqrt(train_loss/(current_step+1))
        
            # Get the Eval results
            eval_loss, actual, predicted_output = eval_fn(data_loader=valid_data_loader,
                                                          model=model)
            
            # Get the actual and predicted
            actual = actual.detach().cpu().numpy()
            predicted_output = predicted_output.detach().cpu().numpy()

            # Calculate Eval RMSE
            eval_rmse = np.sqrt(mean_squared_error(predicted_output, actual))

            print(f"Fold:{fold_no}/{params_dict['total_folds']} Epoch:{epoch}/{params_dict['epoch']} Step: {current_step}/{len(train_data_loader)}, Train_loss: {interim_avg_train_loss :0.4f}, Eval_loss:{eval_loss:0.4f}, Eval RMSE:{eval_rmse:0.4f}")
            
            if eval_rmse<best_score:
                print(f"Eval RMSE improved from {best_score:0.4f} to {eval_rmse:0.4f}")
                best_score = eval_rmse
                best_epoch = epoch
                best_fold = fold_no
                best_step = current_step
                
                # Saving the model
                model_name = f"{config['model_dir']}fold_{fold_no}_{config['model_suffix']}"
                print(f"Saving the model {model_name}")        
                torch.save(model.state_dict(), model_name)
            else:
                print(f"Eval RMSE did not improve from the {best_score:0.4f} from epoch:{best_epoch} step:{best_step}")

            print("")
                
    avg_train_loss = torch.sqrt(train_loss/len(train_data_loader))
    
    # Deleting the intermediate variables
    del train_loss
    gc.collect()
    torch.cuda.empty_cache()
                  
    return avg_train_loss, best_score, best_epoch, best_step
        
    
def eval_fn(data_loader, model):
    
    model.eval()
    actual = torch.tensor([]).to(config['device'])
    predicted_output = torch.tensor([]).to(config['device'])
    
    eval_loss = 0
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            batch_ids = data['ids'].to(config['device'])
            batch_att_mask = data['att_mask'].to(config['device'])
            batch_target = data['target'].to(config['device'])
            
            output = model(ids=batch_ids, 
                           mask=batch_att_mask)
            batch_prediction = output.flatten()
            
            batch_eval_loss = criterion(batch_prediction, batch_target)
            eval_loss+=batch_eval_loss.sum()
            
            actual = torch.hstack([actual, batch_target])
            predicted_output = torch.hstack([predicted_output,  batch_prediction])
            
    avg_eval_loss = torch.sqrt(eval_loss/len(data_loader))
            
    return avg_eval_loss, actual, predicted_output

In [15]:
def train_engine(train_data_loader, eval_data_loader, fold_no):
        
    gc.collect()
    torch.cuda.empty_cache()
    
    seed_everything(seed=100)
    model = MeanPoolingModel()
#     model = nn.DataParallel(model)
    model.to(config['device'])
    
#     optimizer = create_optimizer(model)     
    optimizer = AdamW(model.parameters(), lr=params_dict['learning_rate'], eps = params_dict['EPS'])
    
#     scheduler = get_cosine_schedule_with_warmup(optimizer,
#                                                 num_training_steps=params_dict['epoch'] * len(train_data_loader),
#                                                 num_warmup_steps=50)
    
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps=0, 
                                                num_training_steps=len(train_data_loader) * params_dict['epoch'])
        
    best_score = 100000
    best_epoch=0
    best_step=0
    for epoch in range(1, (params_dict['epoch']+1)):
        train_loss, best_score, best_epoch, best_step = train_fn(train_data_loader=train_data_loader,
                                                                 valid_data_loader=eval_data_loader,
                                                                 model=model,
                                                                 optimizer=optimizer,
                                                                 epoch=epoch,
                                                                 fold_no=fold_no,
                                                                 best_score=best_score,
                                                                 best_epoch=best_epoch,
                                                                 best_step=best_step, 
                                                                 scheduler=scheduler)
            
        print(f"--------------------------------------------------------------------------------")
        print(f"----------------------Fold: {fold_no}, Epoch: {epoch} over----------------------")
        print(f"--------------------------------------------------------------------------------")
        print("")
    
    # Deleting the model and clearing the CUDA memory
    del model
    gc.collect()
    torch.cuda.empty_cache()
        
    return  best_score, best_epoch, best_step

In [16]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

In [17]:
use_fold = 'tb_fold'

total_fold = train_df[use_fold].nunique()

params_dict = {'Max_length':256,
               'train_batch_size':4,
               'valid_batch_size':128,
               'learning_rate':4e-5,
               'EPS':3e-8, 
               'weight_dict':0, 
               'opt': 'ADAMW', # MADGRAD, ADAM, ADAMW
               'scheduler':False,
               'epoch':3,
               'eval_every_step':20,
               'total_folds':total_fold
              }

class ContinuousStratifiedKFold(StratifiedKFold):
    def split(selfself, x, y, groups=None):
        num_bins = int(np.floor(1 + np.log2(len(y))))
        bins = pd.cut(y, bins=num_bins, labels=False)
        return super().split(x, bins, groups)
    
gc.collect()

folds_best = {}
SEED = 42

NUM_FOLDS = 5
# kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
# for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    

kfold = ContinuousStratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df, train_df.target)):    

    fold = fold+1
    
    print(f"\nFold {fold}/{NUM_FOLDS}")
    model_path = f"model_large_{fold}.pth"
        
    set_random_seed(SEED + fold)
    
    train_fold = train_df.loc[train_indices].reset_index(drop=True)
    validation_fold = train_df.loc[val_indices].reset_index(drop=True)
    
    
# for i in range(1, total_fold+1):
    
#     fold_rmse = []
#     train_fold = train_df[train_df[use_fold]!=i].reset_index(drop=True)
#     validation_fold = train_df[train_df[use_fold]==i].reset_index(drop=True)
    
    print(f"Fold {fold}/{total_fold}: Train fold: {train_fold[use_fold].unique()}, Test fold:{validation_fold[use_fold].unique()}")

    train_fold_input = get_input(df=train_fold, data_type='train')
    validation_fold_input = get_input(df=validation_fold, data_type='train')

    train_data_loader = get_data_loader(class_input=train_fold_input, 
                                        batch_size=params_dict['train_batch_size'], 
                                        shuffle=True)
    valid_data_loader = get_data_loader(class_input=validation_fold_input, 
                                        batch_size=params_dict['valid_batch_size'], 
                                        shuffle=False)


    best_score, best_epoch, best_step = train_engine(train_data_loader=train_data_loader, 
                                                     eval_data_loader=valid_data_loader, 
                                                     fold_no=fold)
    
    folds_best[f'fold_{fold}'] = {}
    folds_best[f'fold_{fold}'] = {'best_epoch': best_epoch, 
                                  'best_step':best_step,
                                  'best_score':best_score
                              }
    
    print(f"Fold {fold}/{total_fold} Best Eval RMSE:{best_score}, Best epoch:{best_epoch}, Best step:{best_step}")
    print("")
    print("#######################################################################")
    
print("Fold summary")
print(folds_best)


Fold 1/5
Fold 1/5: Train fold: [3 2 5 4 1], Test fold:[4 3 5 2 1]


Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold:1/5 Epoch:1/3 Step: 0/567, Train_loss: 1.5625, Eval_loss:1.0884, Eval RMSE:1.0953
Eval RMSE improved from 100000.0000 to 1.0953
Saving the model ./fold_1_Roberta_large_model.bin

Fold:1/5 Epoch:1/3 Step: 20/567, Train_loss: 0.9596, Eval_loss:0.9698, Eval RMSE:0.9868
Eval RMSE improved from 1.0953 to 0.9868
Saving the model ./fold_1_Roberta_large_model.bin

Fold:1/5 Epoch:1/3 Step: 40/567, Train_loss: 0.9023, Eval_loss:0.6711, Eval RMSE:0.6768
Eval RMSE improved from 0.9868 to 0.6768
Saving the model ./fold_1_Roberta_large_model.bin

Fold:1/5 Epoch:1/3 Step: 60/567, Train_loss: 0.8825, Eval_loss:0.7728, Eval RMSE:0.7892
Eval RMSE did not improve from the 0.6768 from epoch:1 step:40

Fold:1/5 Epoch:1/3 Step: 80/567, Train_loss: 0.8405, Eval_loss:0.7360, Eval RMSE:0.7389
Eval RMSE did not improve from the 0.6768 from epoch:1 step:40

Fold:1/5 Epoch:1/3 Step: 100/567, Train_loss: 0.8417, Eval_loss:0.7053, Eval RMSE:0.7161
Eval RMSE did not improve from the 0.6768 from epoch:1 step:40


Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold:2/5 Epoch:1/3 Step: 0/567, Train_loss: 1.2697, Eval_loss:1.2066, Eval RMSE:1.2116
Eval RMSE improved from 100000.0000 to 1.2116
Saving the model ./fold_2_Roberta_large_model.bin

Fold:2/5 Epoch:1/3 Step: 20/567, Train_loss: 1.1608, Eval_loss:0.9102, Eval RMSE:0.9142
Eval RMSE improved from 1.2116 to 0.9142
Saving the model ./fold_2_Roberta_large_model.bin

Fold:2/5 Epoch:1/3 Step: 40/567, Train_loss: 1.0180, Eval_loss:0.7519, Eval RMSE:0.7617
Eval RMSE improved from 0.9142 to 0.7617
Saving the model ./fold_2_Roberta_large_model.bin

Fold:2/5 Epoch:1/3 Step: 60/567, Train_loss: 0.9568, Eval_loss:0.7837, Eval RMSE:0.7901
Eval RMSE did not improve from the 0.7617 from epoch:1 step:40

Fold:2/5 Epoch:1/3 Step: 80/567, Train_loss: 0.9364, Eval_loss:1.2586, Eval RMSE:1.2607
Eval RMSE did not improve from the 0.7617 from epoch:1 step:40

Fold:2/5 Epoch:1/3 Step: 100/567, Train_loss: 0.9019, Eval_loss:0.8209, Eval RMSE:0.8181
Eval RMSE did not improve from the 0.7617 from epoch:1 step:40


Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold:3/5 Epoch:1/3 Step: 0/567, Train_loss: 1.2877, Eval_loss:1.0146, Eval RMSE:1.0274
Eval RMSE improved from 100000.0000 to 1.0274
Saving the model ./fold_3_Roberta_large_model.bin

Fold:3/5 Epoch:1/3 Step: 20/567, Train_loss: 0.9611, Eval_loss:0.8671, Eval RMSE:0.8718
Eval RMSE improved from 1.0274 to 0.8718
Saving the model ./fold_3_Roberta_large_model.bin

Fold:3/5 Epoch:1/3 Step: 40/567, Train_loss: 0.8461, Eval_loss:0.9577, Eval RMSE:0.9614
Eval RMSE did not improve from the 0.8718 from epoch:1 step:20

Fold:3/5 Epoch:1/3 Step: 60/567, Train_loss: 0.8254, Eval_loss:0.7300, Eval RMSE:0.7403
Eval RMSE improved from 0.8718 to 0.7403
Saving the model ./fold_3_Roberta_large_model.bin

Fold:3/5 Epoch:1/3 Step: 80/567, Train_loss: 0.8450, Eval_loss:0.7842, Eval RMSE:0.7952
Eval RMSE did not improve from the 0.7403 from epoch:1 step:60

Fold:3/5 Epoch:1/3 Step: 100/567, Train_loss: 0.8381, Eval_loss:0.7064, Eval RMSE:0.7079
Eval RMSE improved from 0.7403 to 0.7079
Saving the model ./fol

Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold:4/5 Epoch:1/3 Step: 0/567, Train_loss: 0.7176, Eval_loss:1.0510, Eval RMSE:1.0498
Eval RMSE improved from 100000.0000 to 1.0498
Saving the model ./fold_4_Roberta_large_model.bin

Fold:4/5 Epoch:1/3 Step: 20/567, Train_loss: 1.0426, Eval_loss:0.9794, Eval RMSE:0.9776
Eval RMSE improved from 1.0498 to 0.9776
Saving the model ./fold_4_Roberta_large_model.bin

Fold:4/5 Epoch:1/3 Step: 40/567, Train_loss: 0.9771, Eval_loss:0.8713, Eval RMSE:0.8715
Eval RMSE improved from 0.9776 to 0.8715
Saving the model ./fold_4_Roberta_large_model.bin

Fold:4/5 Epoch:1/3 Step: 60/567, Train_loss: 0.9322, Eval_loss:0.8804, Eval RMSE:0.8896
Eval RMSE did not improve from the 0.8715 from epoch:1 step:40

Fold:4/5 Epoch:1/3 Step: 80/567, Train_loss: 0.9095, Eval_loss:0.7822, Eval RMSE:0.7703
Eval RMSE improved from 0.8715 to 0.7703
Saving the model ./fold_4_Roberta_large_model.bin

Fold:4/5 Epoch:1/3 Step: 100/567, Train_loss: 0.8896, Eval_loss:0.7548, Eval RMSE:0.7436
Eval RMSE improved from 0.7703 to 0

Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold:5/5 Epoch:1/3 Step: 0/567, Train_loss: 1.3555, Eval_loss:1.2057, Eval RMSE:1.1944
Eval RMSE improved from 100000.0000 to 1.1944
Saving the model ./fold_5_Roberta_large_model.bin

Fold:5/5 Epoch:1/3 Step: 20/567, Train_loss: 1.0528, Eval_loss:0.8392, Eval RMSE:0.8359
Eval RMSE improved from 1.1944 to 0.8359
Saving the model ./fold_5_Roberta_large_model.bin

Fold:5/5 Epoch:1/3 Step: 40/567, Train_loss: 0.9655, Eval_loss:0.8425, Eval RMSE:0.8378
Eval RMSE did not improve from the 0.8359 from epoch:1 step:20

Fold:5/5 Epoch:1/3 Step: 60/567, Train_loss: 0.9335, Eval_loss:0.7648, Eval RMSE:0.7648
Eval RMSE improved from 0.8359 to 0.7648
Saving the model ./fold_5_Roberta_large_model.bin

Fold:5/5 Epoch:1/3 Step: 80/567, Train_loss: 0.8872, Eval_loss:0.7462, Eval RMSE:0.7397
Eval RMSE improved from 0.7648 to 0.7397
Saving the model ./fold_5_Roberta_large_model.bin

Fold:5/5 Epoch:1/3 Step: 100/567, Train_loss: 0.8510, Eval_loss:0.8441, Eval RMSE:0.8434
Eval RMSE did not improve from the 

In [18]:
folds_best

{'fold_1': {'best_epoch': 1, 'best_step': 200, 'best_score': 0.5923193},
 'fold_2': {'best_epoch': 3, 'best_step': 160, 'best_score': 0.48741987},
 'fold_3': {'best_epoch': 1, 'best_step': 300, 'best_score': 0.5497123},
 'fold_4': {'best_epoch': 3, 'best_step': 480, 'best_score': 0.49744383},
 'fold_5': {'best_epoch': 3, 'best_step': 500, 'best_score': 0.5025306}}

In [19]:
folds_best

{'fold_1': {'best_epoch': 1, 'best_step': 200, 'best_score': 0.5923193},
 'fold_2': {'best_epoch': 3, 'best_step': 160, 'best_score': 0.48741987},
 'fold_3': {'best_epoch': 1, 'best_step': 300, 'best_score': 0.5497123},
 'fold_4': {'best_epoch': 3, 'best_step': 480, 'best_score': 0.49744383},
 'fold_5': {'best_epoch': 3, 'best_step': 500, 'best_score': 0.5025306}}