**CROSS-ENCODER EXPERIMENT - SOTA SEARCH**

In [1]:
import gc
import torch

# Collect unreachable Python objects first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [2]:
! nvidia-smi

Sun Dec 11 06:38:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.88       Driver Version: 418.88       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro P4000        On   | 00000000:3B:00.0 Off |                  N/A |
| 46%   37C    P0    28W / 105W |     11MiB /  8119MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Quadro P4000        On   | 00000000:D8:00.0 Off |                  N/A |
| 46%   34C    P8     5W / 105W |    950MiB /  8119MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

In [None]:
! pip install sentence-transformers datasets

In [None]:
import numpy as np
import pandas as pd
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator, CEBinaryAccuracyEvaluator, CEBinaryClassificationEvaluator

from sentence_transformers.readers import InputExample




from tqdm.auto import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Initialise the logger used to monitor training progress.
# The logging module is reloaded first - presumably so that basicConfig() is not
# ignored because of handlers configured earlier in the kernel (confirm).
from importlib import reload
reload(logging)

logging.basicConfig(
    level=logging.DEBUG,
    handlers=[LoggingHandler()],
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [None]:
# Emit one INFO record - presumably a quick check that the logging configuration works.
logging.info('This is an info message')

In [None]:
# Fix every random seed so that runs are repeatable.
import random
import torch
import torch.backends.cudnn as cudnn


seed = 1

# Python and NumPy generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU, current GPU, all GPUs).
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
  seed_fn(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
cudnn.deterministic = True
cudnn.benchmark = False

### Load Model

cross-encoder

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Number of passes over the training data.
num_epochs = 3

# Start from the klue/roberta-base checkpoint with a single output (num_labels=1).
cross_encoder = CrossEncoder(
    "klue/roberta-base",
    num_labels=1,
)

In [None]:
# Mini-batch size used by the training DataLoader.
train_batch_size = 32

# Output directory for the trained model; the timestamp makes each run's path unique.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = "/home/sol3sts/teacher_model_" + "-" + run_timestamp

### Load Data (KLUE-STS)

### 기존 & 새로운 데이터 동시 로드 / val 동일하게 통일 

In [None]:
# Read the three CSV splits in one pass: training set, validation set and the new dev set.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the notebook.
train, val, eval = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/sol3sts/dataset/1209_train_dev&val제거.csv',
        '/home/sol3sts/dataset/1209_val.csv',
        '/home/sol3sts/dataset/new_devset.csv',
    )
)

In [None]:
# Row counts of the train / validation / dev frames.
tuple(map(len, (train, val, eval)))

In [None]:
# Keep only the sentence pair and the new label, then expose the label under the
# column name 'labels.label' in the `test` frame.
eval = eval.loc[:, ['sentence1', 'sentence2', 'new_label']]
test = eval.rename(columns={'new_label': 'labels.label'})


### 새로운 데이터 (new_label)로 데이터 로드 

In [None]:
# Train on the new labels: keep the sentence pair plus `new_label`, renamed to 'labels.label'.
train = (
    train.reset_index()
    .loc[:, ['sentence1', 'sentence2', 'new_label']]
    .rename(columns={'new_label': 'labels.label'})
)

val = (
    val.reset_index()
    .loc[:, ['sentence1', 'sentence2', 'new_label']]
    .rename(columns={'new_label': 'labels.label'})
)

print(len(train))
logging.info("새로운 데이터 로드 ")

In [None]:
# Row counts of the train / validation / test frames.
tuple(map(len, (train, val, test)))

In [None]:
# Load the silver-labelled KorSTS pairs and keep only the three columns used for training.
aug_korsts = pd.read_csv('/home/sol3sts/dataset/aug_korsts.csv').loc[
    :, ['sentence1', 'sentence2', 'labels.label']
]
aug_korsts

In [None]:
# Data augmentation: append the KorSTS pairs labelled 1.8-3.2 to the training set.
# reset_index() gives the combined frame a fresh 0..n-1 index; the old index becomes an 'index' column.
train = pd.concat([train, aug_korsts]).reset_index()
tuple(map(len, (train, val, test)))

In [None]:
# Training rows with label >= 3.0, label < 3.0, and label exactly 0.0.
tuple(
    len(train[mask])
    for mask in (
        train['labels.label'].ge(3.0),
        train['labels.label'].lt(3.0),
        train['labels.label'].eq(0.0),
    )
)

In [None]:
# Training rows whose label is exactly 4.0, 3.5 and 3.7.
tuple(len(train[train['labels.label'].eq(value)]) for value in (4.0, 3.5, 3.7))

In [None]:
# Training rows whose label is exactly 0.2 and 0.3.
tuple(len(train[train['labels.label'].eq(value)]) for value in (0.2, 0.3))

In [None]:
# Down-sample over-represented labels: draw a fixed-size random sample of rows with a
# given label and drop them from `train`. Rows are dropped by index label, so this
# relies on `train` having a unique index at this point.
train_zero = train[train['labels.label'] == 0].sample(n=2000,random_state=77)
train = train.drop(train_zero.index)

train_02 = train[train['labels.label'] == 0.2].sample(n=200,random_state=77)
train = train.drop(train_02.index)

# Fixed: this filter previously used 0.2 again, so label 0.3 was never down-sampled
# and label 0.2 lost 400 rows instead of 200.
train_03 = train[train['labels.label'] == 0.3].sample(n=200,random_state=77)
train = train.drop(train_03.index)

train_40 = train[train['labels.label'] == 4.0].sample(n=350,random_state=77)
train = train.drop(train_40.index)

# NOTE(review): the variable is called train_30 but the filter is 3.7 - confirm the intended label.
train_30 = train[train['labels.label'] == 3.7].sample(n=300,random_state=77)
train = train.drop(train_30.index)



In [None]:
# Row counts of the train / validation / test frames after down-sampling.
tuple(map(len, (train, val, test)))

### 모델 입력 형태로 바꿔주기 (공통) 

In [None]:
# Check the label distribution of each split (the fourth panel of the 2x2 grid stays empty).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(50, 20))

for frame, axis, title in (
    (val, ax[0, 0], "validation data"),
    (test, ax[0, 1], "test data"),
    (train, ax[1, 0], 'train_data'),
):
  sns.countplot(x=frame['labels.label'], ax=axis, color='#FFB6C1')
  axis.set_title(title)

plt.show()

In [None]:
# Row counts of the train / validation / test frames.
tuple(map(len, (train, val, test)))

In [None]:
# Number of training rows.
train.shape[0]

In [None]:
# Bring all three frames to the same layout: sentence1, sentence2, labels.label.
# `train` also gets a fresh 0..n-1 index after the rows dropped earlier.
train = train.reset_index().loc[:, ['sentence1', 'sentence2', 'labels.label']]
val = val.loc[:, ['sentence1', 'sentence2', 'labels.label']]
test = test.loc[:, ['sentence1', 'sentence2', 'labels.label']]

In [None]:
# Convert the frames into the model's input format (train, val, test).

def _to_input_examples(frame):
  """Build one InputExample per row of `frame`.

  `frame` must have the columns 'sentence1', 'sentence2' and 'labels.label'.
  The label is divided by 5.0 to normalise it into the 0-1 range.

  The columns are iterated directly instead of indexing `frame.values[i]`,
  which rebuilt the whole object array several times for every row.
  """
  rows = zip(frame['sentence1'], frame['sentence2'], frame['labels.label'])
  return [
      InputExample(texts=[first, second], label=gold / 5.0)
      for first, second, gold in tqdm(rows, total=len(frame))
  ]


input_examples_train = _to_input_examples(train)
input_examples_val = _to_input_examples(val)
input_examples_test = _to_input_examples(test)

In [None]:
# Aliases under the sts_* names used by the DataLoader and the evaluators.
sts_train_examples, sts_val_examples, sts_test_examples = (
    input_examples_train,
    input_examples_val,
    input_examples_test,
)

In [None]:
# Inspect the first validation example: its sentence pair and its label.
sts_val_examples[0].texts, sts_val_examples[0].label

In [None]:
# Number of examples built for train / validation / test.
tuple(map(len, (input_examples_train, input_examples_val, input_examples_test)))

### Evaluator 정의 (cross encoder)

In [None]:
## CROSS-ENCODER ##
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

# Shuffled mini-batches of training examples.
train_dataloader = DataLoader(
    sts_train_examples,
    batch_size=train_batch_size,
    shuffle=True,
)

logging.info("Creating cross-encoder evaluators")

# Correlation evaluator on the validation examples (passed to fit()).
val_evaluator = CECorrelationEvaluator.from_input_examples(
    sts_val_examples, name="sts-cross_encoder-val"
)

# Correlation evaluator on the test examples (run after training).
test_evaluator = CECorrelationEvaluator.from_input_examples(
    sts_test_examples, name="sts-cross_encoder-test"
)

### defining loss function / training

In [None]:
! nvidia-smi

In [None]:
# Choose which GPU this kernel uses.
GPU_NUM = 0 # index of the GPU to use
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
  # set_device() and current_device() only work with a CUDA device, so they are
  # skipped when the cell has fallen back to the CPU.
  torch.cuda.set_device(device) # change allocation of current GPU
  print ('# Current cuda device: ', torch.cuda.current_device()) # check
else:
  print ('# CUDA is not available; running on CPU')

In [None]:
# Report the device that will be used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
if device.type == 'cuda':
  # current_device() raises when CUDA is unavailable, so it is only queried on GPU.
  print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

In [None]:
## CROSS-ENCODER ##

# Linear learning-rate warm-up over the first 10% of all training steps.
# len(train_dataloader) is already the number of batches per epoch, so it must not
# be divided by the batch size again (the old formula made warm-up ~32x too short).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps)) # log the warm-up length

# Training: the validation evaluator is run every 10% of an epoch and the model
# is written to model_save_path.
cross_encoder.fit(
    train_dataloader = train_dataloader,
    evaluator=val_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    optimizer_params = {'lr':5e-5},
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    show_progress_bar = True
)

In [None]:
# Display the path passed to fit() as output_path.
model_save_path 

### cross-encoder-test_evaluating

In [None]:
# Test: load the model stored at model_save_path and run the test-set correlation evaluator on it.
cross_encoder = CrossEncoder(model_save_path)
test_evaluator(cross_encoder)

#### bin 별로 성능평가 

In [None]:
# Score every test sentence pair with the cross-encoder.
pairs_all = [
    (first, second)
    for first, second in zip(test['sentence1'], test['sentence2'])
]

scores_all = cross_encoder.predict(pairs_all, show_progress_bar=True)

In [None]:
# Scale the predictions by 5 (the inverse of the /5.0 label normalisation) and round to one decimal.
test['silver_label'] = np.around((scores_all * 5).tolist(), decimals=1)
# Unrounded alternative: test['silver_label'] = (scores_all * 5).tolist()
test

In [None]:
# Correlate gold and predicted scores. Only the two numeric score columns are passed to
# corr(), so the sentence text columns cannot make it fail, and the value is looked up
# by label instead of by position.
print("'Spearman correlation' for whole test data:",test[['labels.label','silver_label']].corr(method='spearman').loc['labels.label','silver_label'])
print("'Pearson correlation' for whole test data:",test[['labels.label','silver_label']].corr(method='pearson').loc['labels.label','silver_label'])

In [None]:
## Evaluate the 1.8 - 3.2 gold-label range separately
target_test = test[(test['labels.label'] >=1.8) & (test['labels.label'] <=3.2)]

# Only the two numeric score columns are correlated (text columns would make corr() fail
# on recent pandas). The messages now name the subset instead of "whole test data".
print("'Spearman correlation' for 1.8-3.2 test data:",target_test[['labels.label','silver_label']].corr(method='spearman').loc['labels.label','silver_label'])
print("'Pearson correlation' for 1.8-3.2 test data:",target_test[['labels.label','silver_label']].corr(method='pearson').loc['labels.label','silver_label'])

In [None]:
# Classify each test pair using 3.0 as the "similar" threshold on both the
# predicted (silver) and the gold score. Anything not matched by the first three
# cases is labelled 'False_N'.
correct = [
    'True_P' if (predicted >= 3.0 and gold >= 3.0)
    else 'True_N' if (predicted < 3.0 and gold < 3.0)
    else 'False_P' if (predicted >= 3.0 and gold < 3.0)
    else 'False_N'
    for predicted, gold in zip(test['silver_label'].values, test['labels.label'].values)
]



In [None]:
# Store the per-pair outcome and show how often each outcome occurs.
test['correctness'] = correct

print("whole test data confusion matrix", end="\n\n")
print(test['correctness'].value_counts())

In [None]:
# acc / f1
TP = len(test[test['correctness'] == 'True_P'])
TN = len(test[test['correctness'] == 'True_N'])
FP = len(test[test['correctness'] == 'False_P'])
FN = len(test[test['correctness'] == 'False_N'])

# Precision / recall are undefined (nan) when their denominator is zero,
# instead of raising ZeroDivisionError.
pc = TP/(TP+FP) if (TP+FP) else float('nan')
rc = TP/(TP+FN) if (TP+FN) else float('nan')

if math.isnan(pc) or math.isnan(rc):
  f1 = float('nan')
elif pc + rc == 0:
  f1 = 0.0  # no true positives at all
else:
  f1 = 2*pc*rc/(pc+rc)

print("precision", pc)
print("recall", rc)
print("f1 score", f1)

In [None]:
##### # acc / f1 
# TP = 
# TN = 
# FP = 
# FN = 

# print("precision", TP/(TP+FP))
# print("recall", TP/(TP+FN))
# pc=TP/(TP+FP)
# rc=TP/(TP+FN)
# print("f1 score", 2*pc*rc/(pc+rc))

#### 0.5단위 bin 만들기

In [None]:
# train
# Snap every gold label to the nearest 0.5-wide bin centre (0.0, 0.5, ..., 5.0).
# For one-decimal labels this gives the same bins as the former if/elif ladder
# (0.0-0.2 -> 0.0, 0.3-0.7 -> 0.5, 0.8-1.2 -> 1.0, ...), but values between the old
# ranges (e.g. 0.75) are binned too, so `t` always has one entry per training row.
t = (train['labels.label'].clip(0.0, 5.0) * 2).round().div(2).tolist()


print(len(t))

train['gold_bin'] = t


# One frame per bin, used to report the amount of training data per bin.
train_bin_05 = train[train['gold_bin'] == 0.5]
train_bin_10 = train[train['gold_bin'] == 1.0]
train_bin_15 = train[train['gold_bin'] == 1.5]
train_bin_20 = train[train['gold_bin'] == 2.0]
train_bin_25 = train[train['gold_bin'] == 2.5]
train_bin_30 = train[train['gold_bin'] == 3.0]
train_bin_35 = train[train['gold_bin'] == 3.5]
train_bin_40 = train[train['gold_bin'] == 4.0]
train_bin_45 = train[train['gold_bin'] == 4.5]
train_bin_50 = train[train['gold_bin'] == 5.0]



In [None]:
# test
# Snap gold (a) and predicted (b) scores to the nearest 0.5-wide bin centre
# (0.0, 0.5, ..., 5.0). For one-decimal scores this gives the same bins as the former
# if/elif ladders, but values between the old ranges are binned too, so both lists
# always have one entry per test row.
a = (test['labels.label'].clip(0.0, 5.0) * 2).round().div(2).tolist()

b = (test['silver_label'].clip(0.0, 5.0) * 2).round().div(2).tolist()

len(a), len(b)

In [None]:

# test: per-pair error columns

# Signed error between gold and predicted score, rounded to one decimal.
test['differ'] = np.round(test['labels.label'] - test['silver_label'], 1)

# Bin of the gold score, bin of the predicted score, and whether they agree.
test['gold_bin'] = a
test['silver_bin'] = b
test['same_bin'] = test['gold_bin'] == test['silver_bin']

# Distance between the two bins.
test['bin_distance'] = test['gold_bin'] - test['silver_bin']

# MSE, computed from the rounded error.
test['SE'] = test['differ'] ** 2
print("MSE:", sum(test['SE']) / len(test))

# One frame per gold bin.
(
    bin_05, bin_10, bin_15, bin_20, bin_25,
    bin_30, bin_35, bin_40, bin_45, bin_50,
) = (
    test[test['gold_bin'] == centre]
    for centre in (0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0)
)

In [None]:
print("Test_Evaluation_Per_Bin")
print( "len = number of test_data,", "ME = mean_error")
print()

# One report line per gold bin: correlation between gold and predicted scores,
# number of test pairs, MSE, mean error and number of training rows in that bin.
for bin_name, test_bin, train_bin in (
    ("0.5", bin_05, train_bin_05),
    ("1.0", bin_10, train_bin_10),
    ("1.5", bin_15, train_bin_15),
    ("2.0", bin_20, train_bin_20),
    ("2.5", bin_25, train_bin_25),
    ("3.0", bin_30, train_bin_30),
    ("3.5", bin_35, train_bin_35),
    ("4.0", bin_40, train_bin_40),
    ("4.5", bin_45, train_bin_45),
    ("5.0", bin_50, train_bin_50),
):
  n_pairs = len(test_bin)
  if n_pairs == 0:
    # An empty bin has no correlation or mean; report it instead of dividing by zero.
    print(f"<bin_{bin_name}>:", "len:", 0, "(no test pairs in this bin)", "len(train):", len(train_bin))
    continue

  # Only the two numeric score columns are correlated; text columns would make corr() fail.
  score_columns = test_bin[['labels.label', 'silver_label']]
  pearson = score_columns.corr(method='pearson').loc['labels.label', 'silver_label']
  spearman = score_columns.corr(method='spearman').loc['labels.label', 'silver_label']

  print(
      f"<bin_{bin_name}>:",
      "pearson:", round(pearson, 2),
      "spearman:", round(spearman, 2),
      "len:", n_pairs,
      "MSE:", round(sum(test_bin['SE']) / n_pairs, 2),
      "ME:", round(sum(test_bin['differ']) / n_pairs, 2),
      "len(train):", len(train_bin),
  )

In [None]:
# Confusion matrix

# acc / f1

def confusion_matrix(test):
  """Print the TP/TN/FP/FN counts of `test` and return its classification metrics.

  Args:
    test: DataFrame with a 'correctness' column holding 'True_P', 'True_N',
      'False_P' or 'False_N' for every row.

  Returns:
    A flat tuple of alternating labels and values:
    ("accuracy:", acc) when precision or recall is undefined (no predicted
    positives or no actual positives), otherwise
    ("accuracy:", acc, "precision:", p, "recall:", r, "f1 score:", f1).
    Accuracy is nan for an empty frame; f1 is 0.0 when there are no true positives.
  """
  counts = test['correctness'].value_counts()
  TP = int(counts.get('True_P', 0))
  TN = int(counts.get('True_N', 0))
  FP = int(counts.get('False_P', 0))
  FN = int(counts.get('False_N', 0))
  print("TP:",TP,"TN:",TN,"FP:",FP,"FN",FN)

  total = TP + TN + FP + FN
  accuracy = (TP + TN) / total if total else float('nan')

  # The former check `(TP + FP) & (TP+FN) ==0` was a bitwise AND, which is also zero
  # for e.g. 2 & 1 and so skipped valid metrics; test each denominator explicitly.
  if (TP + FP) == 0 or (TP + FN) == 0:
    return("accuracy:", accuracy)

  pc = TP/(TP+FP)
  rc = TP/(TP+FN)
  # With no true positives both precision and recall are 0; report f1 as 0 instead of 0/0.
  f1 = 2*pc*rc/(pc+rc) if (pc + rc) else 0.0

  return("accuracy:", accuracy, "precision:", pc, "recall:", rc, "f1 score:", f1)

# Header and legend, followed by one confusion-matrix summary per gold-label range.
print("전체 test data")
print()
print("TP = 유사한페어를 유사하다고 정답,", "TN = 유사하지않은 페어를 유사하지 않다고 정답,", "FP=유사하지 않은페어를 유사하다고 오판,", "FN = 유사한페어를 유사하지않다고 오판")

for range_label, range_frame in (
    ("1.8-2.2:", bin_20),
    ("2.3-2.7:", bin_25),
    ("2.8-3.2:", bin_30),
    ("3.3-3.7:", bin_35),
    ("3.8-4.2:", bin_40),
    ("4.3-4.7:", bin_45),
    ("4.8-5.0:", bin_50),
):
  print()
  print(range_label)
  print(confusion_matrix(range_frame))

In [None]:
# False positives / false negatives in the 2.0, 2.5 and 3.0 gold bins, and their total.
FP_1, FN_1 = (len(bin_20[bin_20['correctness'] == outcome]) for outcome in ('False_P', 'False_N'))
FP_2, FN_2 = (len(bin_25[bin_25['correctness'] == outcome]) for outcome in ('False_P', 'False_N'))
FP_3, FN_3 = (len(bin_30[bin_30['correctness'] == outcome]) for outcome in ('False_P', 'False_N'))

sum((FP_1, FP_2, FP_3, FN_1, FN_2, FN_3))

### DATA AUGMENTATION

- 1.8 - 3.2구간의 데이터만 추가해주기 

In [None]:
# Download the KorSTS train / test / dev CSV files.
korsts_train, korsts_test, korsts_val = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://huggingface.co/datasets/hongdijk/kor_nlu_hufs/raw/main/KorSTS/sts-train.csv',
        'https://huggingface.co/datasets/hongdijk/kor_nlu_hufs/raw/main/KorSTS/sts-test.csv',
        'https://huggingface.co/datasets/hongdijk/kor_nlu_hufs/raw/main/KorSTS/sts-dev.csv',
    )
)

In [None]:
# Row counts of the KorSTS train / test / dev frames.
tuple(map(len, (korsts_train, korsts_test, korsts_val)))

In [None]:
# Display the KorSTS training frame.
korsts_train

In [None]:
# Drop rows with missing values from every split, then print a summary of each one.
korsts_test = korsts_test.dropna()
korsts_val = korsts_val.dropna()
korsts_train = korsts_train.dropna()

for cleaned_split in (korsts_test, korsts_val, korsts_train):
  cleaned_split.info()

In [None]:
# Pool all KorSTS splits and keep only the sentence pairs.
korsts_all = (
    pd.concat([korsts_train, korsts_val, korsts_test])
    .reset_index()
    .loc[:, ['sentence1', 'sentence2']]
)
korsts_all

In [None]:
# Predict scores for the unlabelled KorSTS pairs with the current cross-encoder.
pairs_korsts = [
    (first, second)
    for first, second in zip(korsts_all['sentence1'], korsts_all['sentence2'])
]

scores_korsts = cross_encoder.predict(pairs_korsts, show_progress_bar=True)

In [None]:
# Temporary: silver-label all of KorSTS with the model trained on KLUE
# (prediction scaled by 5 and rounded to one decimal).
korsts_all['labels.label'] = np.around((scores_korsts * 5).tolist(), decimals=1)
korsts_all

In [None]:
# Check the distribution of the silver labels.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(50, 20))

sns.countplot(x=korsts_all['labels.label'], color='#FFB6C1', ax=ax)

plt.show()

In [None]:
# Keep only the pairs whose silver label lies in 1.8 - 3.2 (both ends included).
aug_korsts = korsts_all[korsts_all['labels.label'].between(1.8, 3.2)]
aug_korsts

In [None]:
# Write the selected pairs to CSV. The DataFrame index is written as well (index=False is not passed).
aug_korsts.to_csv('/home/sol3sts/dataset/aug_korsts.csv')