In [8]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import numpy as np
import glob
import os

In [4]:
import pickle

# Load the dataset object from the pickle file in the parent directory.
# NOTE(review): unpickling can execute arbitrary code — only load a file you trust.
with open('../datasets2.pkl', 'rb') as dataset_file:
    datasets = pickle.load(dataset_file)

In [6]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
# Cast every example's label to a NumPy float32 scalar, in place.
for example in datasets:
    example.label = np.float32(example.label)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples; the remaining 80% become the training set.
train_data, temp_data = train_test_split(
    datasets,
    random_state=42,
    test_size=0.2,
)

# Split the held-out 20% in half, so test and eval each receive roughly 10%
# of the full dataset.
test_data, eval_data = train_test_split(
    temp_data,
    random_state=42,
    test_size=0.5,
)

In [11]:
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Training batches: 32 examples per batch, with shuffling enabled.
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)

# Embedding-similarity evaluator built from the validation split ("sts-dev").
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_data, name="sts-dev")

# Embedding-similarity evaluator built from the test split ("sts-test").
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_data, name="sts-test")


In [12]:
from sentence_transformers import SentenceTransformer, models

# Token-level encoder backed by the deepset/roberta-base-squad2 checkpoint,
# configured with a maximum sequence length of 256.
# NOTE(review): do_lower_case=True is passed here; RoBERTa checkpoints are
# presumably case-sensitive, so confirm that lowercasing the input is intended.
embedding_model = models.Transformer(
    model_name_or_path="deepset/roberta-base-squad2",
    do_lower_case=True,
    max_seq_length=256,
)

# Sentence vector = mean over the token embeddings; CLS-token and max pooling
# are both switched off.
pooling_model = models.Pooling(
    word_embedding_dimension=embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain the encoder and the pooling layer into one SentenceTransformer.
model = SentenceTransformer(modules=[embedding_model, pooling_model])



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Fine-tune with CosineSimilarityLoss and run the model on the selected device.
train_loss = losses.CosineSimilarityLoss(model=model)
model.to(device)

# Number of passes over the training set; also drives the warm-up length below.
num_epochs = 4

# Warm up over the first 10% of the training steps. The step count is taken
# from the training dataloader (batches per epoch) times the epoch count, so it
# reflects only the training split and the dataloader's real batch size rather
# than the size of the full, unsplit dataset.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train, evaluating on the dev evaluator every 10% of an epoch. When the
# dataloader has fewer than 10 batches this evaluates to 0 (presumably meaning
# "evaluate only at epoch end" — confirm against the library docs).
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
)

In [None]:
def embed_processes_with_sbert(tokenized_processes):
    """Embed each tokenized process tree as a single sentence vector.

    Every entry of ``tokenized_processes`` is flattened into one string by
    joining its tokens with single spaces. All strings are then encoded with
    the module-level ``model`` in one batched ``encode`` call instead of one
    call per entry.

    Args:
        tokenized_processes: Iterable of token sequences; each sequence must be
            an iterable of strings accepted by ``str.join``.

    Returns:
        list: One embedding per input entry, in input order. Returns an empty
        list when there is nothing to embed.
    """
    # Flatten every process tree into one whitespace-separated text.
    process_texts = [" ".join(tree_tokens) for tree_tokens in tokenized_processes]

    # Nothing to encode: skip the model call entirely.
    if not process_texts:
        return []

    # Assumes encode() returns a 2-D array with one row per input text (the
    # library default) — TODO confirm if encode options are ever changed.
    # Each row already is the pooled sentence embedding, so no extra averaging
    # is needed.
    embeddings = model.encode(process_texts)

    return list(embeddings)

In [None]:
# Process-tree groups to embed, in the same order as GROUP_NAMES below.
groups = []

# Name of the result for each position in `groups`.
GROUP_NAMES = (
    "naive",
    "simple",
    "concise",
    "straightforward",
    "complex",
    "spaghetti",
)

# Legacy numbered placeholders: kept so that any cell still referring to them
# keeps working. Nothing in this cell fills them.
SSL_sbert_embeddings_1 = []
SSL_sbert_embeddings_2 = []
SSL_sbert_embeddings_3 = []
SSL_sbert_embeddings_4 = []
SSL_sbert_embeddings_5 = []

# Start every named result as an empty list so that all of them are defined
# even when `groups` holds fewer than six entries (or none at all).
SSL_sbert_embeddings_by_group = {group_name: [] for group_name in GROUP_NAMES}

# Embed each group and store the result under its name; zip() stops at the
# shorter of the two sequences, so surplus groups are ignored.
for group_name, group in zip(GROUP_NAMES, groups):
    SSL_sbert_embeddings_by_group[group_name] = embed_processes_with_sbert(group)

# Expose the per-group results under their individual variable names.
SSL_sbert_embeddings_naive = SSL_sbert_embeddings_by_group["naive"]
SSL_sbert_embeddings_simple = SSL_sbert_embeddings_by_group["simple"]
SSL_sbert_embeddings_concise = SSL_sbert_embeddings_by_group["concise"]
SSL_sbert_embeddings_straightforward = SSL_sbert_embeddings_by_group["straightforward"]
SSL_sbert_embeddings_complex = SSL_sbert_embeddings_by_group["complex"]
SSL_sbert_embeddings_spaghetti = SSL_sbert_embeddings_by_group["spaghetti"]