# LMSYS Keywords torch RoBERTa for Submission
Designed to run with internet access disabled (offline submission notebook).


- https://www.kaggle.com/code/stpeteishii/lmsys-prompt-response-words-keybert <br/>
train data processing

- https://www.kaggle.com/code/stpeteishii/lmsys-keywords-torch-roberta <br/>
model training using processed train data

- https://www.kaggle.com/code/stpeteishii/download-keybert <br/>
download keybert

- https://www.kaggle.com/code/stpeteishii/save-distilbert-base-nli-mean-tokens <br/>
download distilbert-base-nli-mean-tokens

- https://www.kaggle.com/code/stpeteishii/lmsys-keywords-torch-roberta-for-submission <br/>
test data processing, inference (this notebook)

In [None]:
!pip install keybert --no-index --find-links=file:///kaggle/input/download-keybert

In [None]:
from keybert import KeyBERT

In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import matplotlib.pyplot as plt 
import transformers
import random
import warnings
# Keep the notebook output free of library warnings.
warnings.simplefilter('ignore')

# AMP gradient scaler; nothing else in the visible notebook code reads it.
scaler = torch.cuda.amp.GradScaler()

# Use the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
def random_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        SEED: Integer seed applied to every generator.
    """
    random.seed(SEED)
    # Only interpreters started after this point (e.g. subprocesses) pick this
    # up; the hash seed of the already-running process cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may pick different kernels from run to run, which
    # would undermine the deterministic mode requested above.
    torch.backends.cudnn.benchmark = False


SEED = 508
random_seed(SEED)

# Process Test Data

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model loaded from a local input directory.
local_model = SentenceTransformer('/kaggle/input/save-distilbert-base-nli-mean-tokens')
# Keyword extractor that uses the embedding model above.
modelky = KeyBERT(model=local_model)

In [None]:
# Load the competition test set and add placeholder keyword columns.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
for kw_column in ('prompt_kw', 'res_a_kw', 'res_b_kw'):
    test[kw_column] = '-'

# Keyword extraction per column: top_n=5 for the prompt, top_n=10 for each
# of the two responses.
tkw0 = modelky.extract_keywords(test['prompt'], top_n=5)
tkw1 = modelky.extract_keywords(test['response_a'], top_n=10)
tkw2 = modelky.extract_keywords(test['response_b'], top_n=10)

In [None]:
def _join_keywords(doc_keywords):
    """Return one document's keywords as a single space-separated string.

    Each item of ``doc_keywords`` is indexed with ``[0]`` to obtain the
    keyword text; keywords containing an underscore are skipped.
    """
    return ' '.join(item[0] for item in doc_keywords if '_' not in item[0])


# One list assignment per column avoids a per-row ``test.loc[i, ...]`` write.
# Each keyword list must hold exactly one entry per row of ``test``;
# a length mismatch raises instead of leaving the '-' placeholder in place.
for kw_column, per_doc_keywords in (
    ('prompt_kw', tkw0),
    ('res_a_kw', tkw1),
    ('res_b_kw', tkw2),
):
    test[kw_column] = [_join_keywords(kws) for kws in per_doc_keywords]

# Append the prompt keywords to each response's keywords, separated by ' // '.
test['res_a_kw'] = test['res_a_kw'] + ' // ' + test['prompt_kw']
test['res_b_kw'] = test['res_b_kw'] + ' // ' + test['prompt_kw']
# Drop the first four columns by position.
# NOTE(review): presumably id/prompt/response_a/response_b — confirm against test.csv.
test = test.iloc[:, 4:]
display(test)

#test.to_csv('test_key.csv',index=False)

In [None]:
# One row per "response keywords // prompt keywords" text. The label is a
# constant 0 placeholder. Each frame is constructed directly instead of adding
# a column to a selection of ``test``, which avoids the chained-assignment
# (SettingWithCopy) pattern.
testA = pd.DataFrame({'text': test['res_a_kw'], 'label': 0})
testB = pd.DataFrame({'text': test['res_b_kw'], 'label': 0})
# All response-A rows first, followed by all response-B rows.
TEST = pd.concat([testA, testB], axis=0)

In [None]:
# Token length passed as ``max_length`` to the tokenizer in BERTDataSet.
# NOTE(review): presumably has to match the value used when the checkpoints
# were trained — confirm against the training notebook.
max_sens = 8
# Renumber the concatenated rows 0..N-1 so integer lookups by label are unique.
p_test=TEST.reset_index(drop=True)

In [None]:
class BERTDataSet(Dataset):
    """Dataset that tokenizes one text per item for model inference.

    Item access relies on two module-level names: ``tokenizer`` (used through
    ``encode_plus``) and ``max_sens`` (the fixed token length of every item).

    Args:
        sentences: Indexable collection of texts, read as ``sentences[idx]``.
        targets: Indexable collection of numeric labels aligned with
            ``sentences``, read as ``targets[idx]``.
    """

    def __init__(self, sentences, targets):
        self.sentences = sentences
        self.targets = targets

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the ``ids``, ``mask`` and ``targets`` tensors for item ``idx``."""
        sentence = self.sentences[idx]
        # Explicit ``padding``/``truncation`` replace the deprecated
        # ``pad_to_max_length`` flag and make the cut-off at ``max_sens``
        # tokens intentional instead of relying on a backward-compatibility
        # fallback. Every item therefore has the same length, which the
        # default batch collation needs.
        bert_sens = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_sens,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        ids = torch.tensor(bert_sens['input_ids'], dtype=torch.long)
        mask = torch.tensor(bert_sens['attention_mask'], dtype=torch.long)
        target = torch.tensor(self.targets[idx], dtype=torch.float)

        return {
            'ids': ids,
            'mask': mask,
            'targets': target,
        }

In [None]:
# Wrap the prepared texts and their placeholder labels for batched inference.
test_dataset = BERTDataSet(p_test["text"], p_test["label"])
test_batch = 32
# shuffle=False keeps the batches in the row order of p_test.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=test_batch,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

# Define `predicting`
Run inference with each saved model checkpoint.

In [None]:
# Tokenizer and single-output classification model built from the local
# roberta-base directory.
tokenizer = transformers.RobertaTokenizer.from_pretrained("/kaggle/input/roberta-base")
model = transformers.RobertaForSequenceClassification.from_pretrained("/kaggle/input/roberta-base",num_labels=1)
# os.listdir returns entries in arbitrary order; sort so the checkpoint order
# (and anything that depends on it) is reproducible between runs.
ckpt_dir = "/kaggle/input/lmsys-keywords-torch-roberta"
pths = sorted(
    os.path.join(ckpt_dir, s) for s in os.listdir(ckpt_dir) if ".pth" in s
)
print(pths)

In [None]:
def predicting(
    test_dataloader,
    model,
    pths
):
    """Run inference once per checkpoint and collect the predictions.

    Uses the module-level ``device`` for the model and the input tensors.

    Args:
        test_dataloader: Iterable of batches; each batch is a mapping with
            ``"ids"`` and ``"mask"`` tensors.
        model: Module whose weights are overwritten by every checkpoint. It is
            called as ``model(ids, mask)`` and must return a mapping with a
            ``"logits"`` entry.
        pths: Paths of checkpoint files; each must load to a mapping with a
            ``"state_dict"`` entry.

    Returns:
        A list with one NumPy array per checkpoint, holding that checkpoint's
        batch outputs concatenated in dataloader order. A checkpoint run over
        an empty dataloader yields an empty array.
    """
    allpreds = []
    for pth in pths:
        # Deserialize onto the CPU first, then move the whole model to
        # ``device`` below.
        state = torch.load(pth, map_location=torch.device('cpu'))
        model.load_state_dict(state["state_dict"])
        model.to(device)
        model.eval()

        preds = []
        with torch.no_grad():
            for a in test_dataloader:
                ids = a["ids"].to(device)
                mask = a["mask"].to(device)
                output = model(ids, mask)
                # Drop the trailing size-1 logit axis.
                output = output["logits"].squeeze(-1)
                preds.append(output.cpu().numpy())

        # np.concatenate raises on an empty list, so guard the no-batch case.
        if preds:
            allpreds.append(np.concatenate(preds))
        else:
            allpreds.append(np.empty(0, dtype=np.float32))

    return allpreds

In [None]:
# One prediction array per checkpoint path in pths.
tpreds = predicting(test_dataloader,model,pths)

# Prediction Result

In [None]:
# Average the predictions of every checkpoint instead of keeping only the
# first array and discarding the rest. With a single checkpoint the result is
# unchanged. Kept as a list so later slicing behaves as before.
test_pred = list(np.mean(np.stack(tpreds), axis=0))

In [None]:
# The first len(test) entries of test_pred belong to response A and the
# remaining entries to response B.
submit = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')
n_rows = len(test)
submit['winner_model_a'] = test_pred[:n_rows]
submit['winner_model_b'] = test_pred[n_rows:]
pa = submit['winner_model_a']
pb = submit['winner_model_b']
# NOTE(review): the tie score is the sum of both scores clipped to [0, 1] —
# confirm this is the intended formula.
submit['winner_tie'] = (pa + pb).clip(lower=0, upper=1)
display(submit)
submit.to_csv('submission.csv', index=False)