# Setting

In [16]:
import math
import logging
import pickle
import pm4py
import re
import random
from itertools import chain

import torch
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from torch.utils.data import DataLoader, random_split
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader, Dataset
import os
from transformers import AutoTokenizer, AutoModel
from torch.optim import Adam
import itertools


import matplotlib.pyplot as plt
from zss import simple_distance, distance, Node
from tqdm.auto import tqdm


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:
def load_process_trees(filenames):
    process_trees_list = []
    for filename in filenames:
        path = f'datasets/firstcase_10k_trees/{filename}'
        with open(path, 'rb') as file:
            process_trees = pickle.load(file)
        process_trees_list.append(process_trees)
    return process_trees_list

In [19]:
# 파일 이름 리스트
filenames = [
    'processtree_naive.pkl',
    'processtree_simple.pkl',
    'processtree_concise.pkl',
    'processtree_straightforward.pkl',
    'processtree_complex.pkl',
    'processtree_spaghetti.pkl'
]
chunk_size = 10000
# 파일에서 불러오기
processtree_list = load_process_trees(filenames)

In [20]:
def tokenize_tree(tree):
    # 특수 문자를 이스케이프하여 정규 표현식 적용 + 괄호는 넣기
    special_characters = r"[\*\+\./:;<=\?\[\]\^_`{|}~\(\)]"
    tokenized_tree = re.findall(r"\b\w+\b|" + special_characters + r"|\u2192", str(tree).replace("->", "\u2192"))
    return tokenized_tree

def tokenize_tree_v2(tree):
    special_characters = r"[\*\+\./:;<=\?\[\]\^_`{|}~\(\)]"
    tree = ''.join('_' if (char.isalpha() and char != 'X') else char for char in str(tree))
    tokenized_tree = re.findall(r"\b\w+\b|" + special_characters + r"|\u2192", tree.replace("->", "\u2192"))
    
    return tokenized_tree

In [21]:
class TreeNode:
    def __init__(self, value):
        self.value = value
        self.children = []

    def add_child(self, child):
        self.children.append(child)

    def print_tree(self, depth=0):
        print("  " * depth + self.value)
        for child in self.children:
            child.print_tree(depth + 1)

    def to_zss_node(self):
        if not self.children:
            return Node(self.value, [])
        else:
            children_nodes = [child.to_zss_node() for child in self.children]
            return Node(self.value, children_nodes)

    def count_nodes(self):
        # 현재 노드를 포함하여 노드 수 계산
        count = 1
        for child in self.children:
            count += child.count_nodes()
        return count

def text_to_tree(sentence):
    stack = []
    root = TreeNode(None)
    sentence = sentence.replace(" ", "")
    for i in range(len(sentence)):
        if sentence[i] != '(' and sentence[i] != ')':
            cur = sentence[i]
            child = TreeNode(cur)
            root.add_child(child)
        elif sentence[i] == '(':
            stack.append(child)
            root = child
        else:
            stack.pop()
            if stack != []:
                root = stack[-1]
    # tree = root.to_zss_node()
    return root

def tree_edit_distance(graph1, graph2):
    tree1 = graph1.to_zss_node()
    tree2 = graph2.to_zss_node()
    dist = simple_distance(tree1, tree2)
    return dist

def max_edit_distance(graph1, graph2):
    max1 = graph1.count_nodes()
    max2 = graph2.count_nodes()
    dist = max1 + max2
    return dist

def graph_edit_similarity(sentence1, sentence2):
    graph1 = text_to_tree(sentence1)
    graph2 = text_to_tree(sentence2)

    ged = tree_edit_distance(graph1, graph2)
    maxged = max_edit_distance(graph1, graph2)

    sim = 1 - (ged / maxged)

    return sim

def max_common_subtree(graph1, graph2):
    
    if graph1.value == graph2.value:
        common_size = 1
        
        if graph1.value == '→' and graph2.value == '→':
            children1 = graph1.children
            children2 = graph2.children
        else:
            children1 = sorted(graph1.children, key=lambda x: x.value)
            children2 = sorted(graph2.children, key=lambda x: x.value)

        matched_children2 = set()  # 이미 매칭된 자식 노드 추적
        for child1 in children1:
            best_match_size = 0  # 자식 간 최대 매칭 크기 저장
            best_match_child2 = None

            for child2 in children2:
                if child2 not in matched_children2:
                    match_size = max_common_subtree(child1, child2)
                    if match_size > best_match_size:
                        best_match_size = match_size
                        best_match_child2 = child2
            
            if best_match_child2:
                common_size += best_match_size
                matched_children2.add(best_match_child2)
                
        return common_size
    
    max_size = 0
    for child1 in graph1.children:
        for child2 in graph2.children:
            max_size = max(max_size, max_common_subtree(child1, child2))
    
    return max_size

def mcs_similarity(sentence1, sentence2):
    graph1 = text_to_tree(sentence1)
    graph2 = text_to_tree(sentence2)

    mcs = max_common_subtree(graph1, graph2)
    max_length = max(graph1.count_nodes(), graph2.count_nodes())

    # 분모가 0일 경우 예외 처리
    if max_length == 0:
        return 0.0
    sim = mcs / max_length
    
    return sim


In [22]:
def cosine_avg(embedded_group):
    num_groups = len(embedded_group)
    similarity_matrix = np.zeros((num_groups, num_groups))

    for i in range(num_groups):
        for j in range(i, num_groups):
            embeddings1 = embedded_group[i]
            embeddings2 = embedded_group[j]

            similarity_matrix[i, j] = np.mean(cosine_similarity(embeddings1, embeddings2))

    print("Similarity Matrix:")
    print(similarity_matrix)
    return similarity_matrix

In [23]:
def save_datasets(datasets, file_path):
    with open(file_path, 'wb') as file:
        pickle.dump(datasets, file)
    print(f"Datasets saved to {file_path}")

def load_datasets(file_path):
    with open(file_path, 'rb') as file:
        datasets = pickle.load(file)
    print(f"Datasets loaded from {file_path}")
    return datasets

In [24]:
tokenized_processtrees = [tokenize_tree(tree) for process_list in processtree_list for tree in process_list]
tokenized_groups = [tokenized_processtrees[i:i+chunk_size] for i in range(0, len(tokenized_processtrees), chunk_size)]
origin_groups = []
for group in tokenized_groups:
    group_trees = [" ".join(tree) for tree in group]
    origin_groups.append(group_trees)
all_process_trees = list(chain.from_iterable(origin_groups))

In [25]:
replaced_tokenized_processtrees = [tokenize_tree_v2(tree) for process_list in processtree_list for tree in process_list]
replaced_tokenized_groups = [replaced_tokenized_processtrees[i:i+chunk_size] for i in range(0, len(replaced_tokenized_processtrees), chunk_size)]
replaced_groups = []
for group in replaced_tokenized_groups:
    group_trees = [" ".join(tree) for tree in group]
    replaced_groups.append(group_trees)
all_replaced_trees = list(chain.from_iterable(replaced_groups))

In [26]:
# Define a simple dataset class
class TripletDataset(Dataset):
    def __init__(self, triplets):
        self.triplets = triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        anchor, positive, negative, ged_ap, ged_an = self.triplets[idx]
        return anchor, positive, negative, ged_ap, ged_an

# Define cosine distance
def cosine_distance(embedding1, embedding2):
    return 1 - nn.functional.cosine_similarity(embedding1, embedding2)

# Define the Triplet Loss with separate lambda weights for AP and AN
class HybridTripletLoss(nn.Module):
    def __init__(self, margin=0.2, lambda_ap=0.1, lambda_an=0.1):
        super(HybridTripletLoss, self).__init__()
        self.margin = margin
        self.lambda_ap = lambda_ap
        self.lambda_an = lambda_an

    def forward(self, anchor, positive, negative, ged_ap, ged_an):
        # Cosine distances
        d_ap = cosine_distance(anchor, positive)
        d_an = cosine_distance(anchor, negative)
        
        # Triplet loss (cosine distance-based)
        triplet_loss = torch.clamp(d_ap - d_an + self.margin, min=0.0)
        
        # GED-based regularization with separate weights
        ged_loss = (self.lambda_ap * torch.abs(d_ap - ged_ap)) + (self.lambda_an * torch.abs(d_an - ged_an))
        
        # Total loss
        return triplet_loss.mean() + ged_loss.mean()
    

In [27]:
def embedding_groups(groups, model, tokenizer, batch_size=32, pooling="cls"):
    result = []
    for i, group in enumerate(groups):
        result.append([])

        for start in range(0, len(group), batch_size):
            batch = group[start:start+batch_size]
            batch_texts = [" ".join(tree) for tree in batch]
            
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                last_hidden_state = outputs.last_hidden_state

                if pooling == "cls":
                    batch_embeddings = last_hidden_state[:, 0, :]
                elif pooling == "mean":
                    batch_embeddings = last_hidden_state.mean(dim=1)
                elif pooling == "max":
                    batch_embeddings, _ = last_hidden_state.max(dim=1)

            result[i].extend(batch_embeddings.cpu().numpy())

    return result


In [28]:
def triplets_sampling(embedded_groups, all_pts):
    triplets = []

    for group_idx, group_embeddings in enumerate(embedded_groups):
        group_offset = group_idx * 10000

        for X_idx, X in enumerate(group_embeddings):
            similarities = cosine_similarity([X], group_embeddings)[0]
            
            filtered_similarities = [(sim, idx) for idx, sim in enumerate(similarities) if sim < 0.9999]
            local_Y_idx = max(filtered_similarities, key=lambda x: x[0])[1] 
            Y_idx = group_offset + local_Y_idx

            least_similar_idxs = []
            for other_group_idx, other_group_embeddings in enumerate(embedded_groups):
                if other_group_idx != group_idx:
                    other_group_offset = other_group_idx * 10000
                    other_similarities = cosine_similarity([X], other_group_embeddings)[0]
                    least_similar_idx = np.argmin(other_similarities)
                    least_similar_idxs.append(other_group_offset + least_similar_idx)

            anchor = all_pts[group_offset + X_idx]
            positive = all_pts[Y_idx]
            for idx in least_similar_idxs:
                negative = all_pts[idx]
                triplets.append((anchor, positive, negative))
    
    return triplets


In [29]:
def exception_pattern_sampling(embedded_groups, all_pts):
    triplets = []

    # Wrapping the outer loop with tqdm to track progress
    for group_idx, group_embeddings in tqdm(enumerate(embedded_groups), total=len(embedded_groups), desc="Processing groups"):
        group_offset = group_idx * 10000

        # Wrapping the inner loop with tqdm to track progress within each group
        for X_idx, X in tqdm(enumerate(group_embeddings), total=len(group_embeddings), desc=f"Group {group_idx}", leave=False):
            # Intra-Cluster에서 가장 유사도가 낮은 Positive 샘플링
            similarities = cosine_similarity([X], group_embeddings)[0]
            filtered_similarities = [(sim, idx) for idx, sim in enumerate(similarities) if sim < 0.9999]
            least_similar_local_Y_idx = min(filtered_similarities, key=lambda x: x[0])[1]
            least_Y_idx = group_offset + least_similar_local_Y_idx

            # Inter-Cluster에서 가장 유사도가 높은 Negative 샘플링
            most_similar_idxs = []
            for other_group_idx, other_group_embeddings in enumerate(embedded_groups):
                if other_group_idx != group_idx:
                    other_group_offset = other_group_idx * 10000
                    other_similarities = cosine_similarity([X], other_group_embeddings)[0]
                    most_similar_idx = np.argmax(other_similarities)
                    most_similar_idxs.append(other_group_offset + most_similar_idx)

            # Triplet 생성
            anchor = all_pts[group_offset + X_idx]
            positive = all_pts[least_Y_idx]
            for idx in most_similar_idxs:
                negative = all_pts[idx]
                triplets.append((anchor, positive, negative))

    return triplets


In [30]:
def additional_pattern_sampling(embedded_groups, all_pts):
    triplets = []

    # Wrapping the outer loop with tqdm to track progress
    for X_idx, X in tqdm(enumerate(all_pts), total=len(all_pts), desc="Processing all points"):
        # Anchor 설정
        anchor = X
        # Anchor의 임베딩 값 가져오기
        group_idx = X_idx // 10000  # 각 그룹의 크기가 10000으로 고정된 경우
        local_idx = X_idx % 10000
        embedded_X = embedded_groups[group_idx][local_idx]  # Anchor의 임베딩 값

        # Wrapping the inner loop with tqdm to track progress for each group
        for group_idx, group_embeddings in tqdm(enumerate(embedded_groups), total=len(embedded_groups), desc=f"Group {group_idx}", leave=False):
            group_offset = group_idx * 10000

            # Anchor 임베딩과 해당 그룹의 임베딩 비교
            similarities = cosine_similarity([embedded_X], group_embeddings)[0]

            # 가장 유사한 Positive 및 가장 유사하지 않은 Negative 찾기
            filtered_similarities = [(sim, idx) for idx, sim in enumerate(similarities) if idx != local_idx or group_idx != X_idx // 10000]
            positive_idx = max(filtered_similarities, key=lambda x: x[0])[1]  # Positive는 가장 유사한 것
            negative_idx = min(filtered_similarities, key=lambda x: x[0])[1]  # Negative는 가장 유사하지 않은 것

            # Positive와 Negative 설정
            ap = all_pts[group_offset + positive_idx]
            an = all_pts[group_offset + negative_idx]

            # Triplet 저장
            triplets.append((anchor, ap, an))

    return triplets


In [31]:
def make_dataloader(filename, n, base_ratio, exceptional_ratio, additional_ratio):
    # 데이터셋 로드
    base_dataset = load_datasets(f'datasets/triplets/base_{filename}_300k.pkl')
    exceptional_dataset = load_datasets(f'datasets/triplets/exceptional_{filename}_300k.pkl')
    additional_dataset = load_datasets(f'datasets/triplets/additional_{filename}_300k.pkl')

    # 각 데이터셋의 샘플 개수 계산
    total_ratio = base_ratio + exceptional_ratio + additional_ratio
    base_count = int(n * (base_ratio / total_ratio))
    exceptional_count = int(n * (exceptional_ratio / total_ratio))
    additional_count = int(n * (additional_ratio / total_ratio))

    # 각 데이터셋에서 샘플링
    sampled_base = random.sample(list(base_dataset), base_count)
    sampled_exceptional = random.sample(list(exceptional_dataset), exceptional_count)
    sampled_additional = random.sample(list(additional_dataset), additional_count)

    # 샘플링된 데이터 합치기
    combined_items = sampled_base + sampled_exceptional + sampled_additional

    # 데이터 로더 생성
    dataloader = DataLoader(combined_items, batch_size=2, shuffle=True)

    return dataloader


In [32]:
def save_model(model, tokenizer, dataLoader, num_epochs, criterion, optimizer, save_path):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataLoader:
            # 데이터 로드
            anchor_texts, positive_texts, negative_texts, aps, ans = batch

            # 텍스트를 토큰화하고 GPU/CPU로 이동
            anchor_inputs = tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
            positive_inputs = tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
            negative_inputs = tokenizer(negative_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)

            # DistilBERT의 경우 디코더 입력 없이 임베딩 생성
            anchor_outputs = model(**anchor_inputs)
            positive_outputs = model(**positive_inputs)
            negative_outputs = model(**negative_inputs)

            # [CLS] 토큰의 임베딩 추출 또는 Mean pooling
            anchor_embeddings = anchor_outputs.last_hidden_state[:, 0, :]  # [CLS] 토큰
            positive_embeddings = positive_outputs.last_hidden_state[:, 0, :]
            negative_embeddings = negative_outputs.last_hidden_state[:, 0, :]

            # 추가 데이터 GPU로 이동
            aps = aps.to(device) if isinstance(aps, torch.Tensor) else torch.tensor(aps, dtype=torch.float32).to(device)
            ans = ans.to(device) if isinstance(ans, torch.Tensor) else torch.tensor(ans, dtype=torch.float32).to(device)

            # 손실 계산 및 역전파
            loss = criterion(anchor_embeddings, positive_embeddings, negative_embeddings, aps, ans)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 에포크 손실 기록
            total_loss += loss.item()

        # 평균 손실 출력
        avg_loss = total_loss / len(dataLoader)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss}")

    # 학습 완료 후 모델 저장
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

    return model


# Evaluation

In [None]:
import os
from transformers import AutoTokenizer, AutoModel

sizes = [20000, 30000, 40000, 50000, 100000]
lambda_value = 0.1
num_epochs = 4 
models = ['distilbert-base-uncased']
similarities = ['GED', 'MCS']
ratio = [0.6, 0.2, 0.2]

for similarity in similarities:
    for size in sizes:
        dataLoader = make_dataloader(similarity, size, ratio[0], ratio[1], ratio[2])
        for model in models:
            model_path = f"models/triplet/{similarity}/{size}/{model}_{lambda_value}.pth"
            
            if os.path.exists(model_path):
                print(f"'{model_path}' 파일이 존재합니다.")
                continue
            
            tokenizer = AutoTokenizer.from_pretrained(model)
            model = AutoModel.from_pretrained(model).to(device)
            
            criterion = HybridTripletLoss(margin=0.2, lambda_ap=lambda_value, lambda_an=lambda_value).to(device)
            optimizer = Adam(model.parameters(), lr=1e-5)

            save_model(model= model, tokenizer= tokenizer, dataLoader= dataLoader, 
                       num_epochs= num_epochs, criterion= criterion, optimizer=optimizer, 
                       save_path=model_path)

Datasets loaded from datasets/triplets/base_GED_300k.pkl
Datasets loaded from datasets/triplets/exceptional_GED_300k.pkl
Datasets loaded from datasets/triplets/additional_GED_300k.pkl
Epoch [1/4], Loss: 0.08664140985748942
Epoch [2/4], Loss: 0.07549505862656831
Epoch [3/4], Loss: 0.07279398863134584
Epoch [4/4], Loss: 0.07113945742245895
Model saved to models/triplet/10000/distilbert-base-uncased_0.1.pth
Datasets loaded from datasets/triplets/base_MCS_300k.pkl
Datasets loaded from datasets/triplets/exceptional_MCS_300k.pkl
Datasets loaded from datasets/triplets/additional_MCS_300k.pkl
'models/triplet/10000/distilbert-base-uncased_0.1.pth' 파일이 존재합니다.


In [None]:
## SBERT Training

sizes = [10000, 20000, 30000, 40000, 50000, 100000]
lambda_value = 0.1
num_epochs = 4 
similarities = ['GED', 'MCS']
ratio = [0.6, 0.2, 0.2]

for similarity in similarities:
    for size in sizes:
        ged_dataLoader = make_dataloader(similarity, size, ratio[0], ratio[1], ratio[2])

        # 모델 저장
        model_path = f"models/triplet/{similarity}/{size}/sbert_{lambda_value}.pth"
        if os.path.exists(model_path):
                print(f"'{model_path}' 파일이 존재합니다.")
                continue
            
        # 각 lambda_value 마다 새로운 SBERT 모델 인스턴스를 생성
        sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)
        
        criterion = HybridTripletLoss(margin=0.2, lambda_ap=lambda_value, lambda_an=lambda_value).to(device)
        optimizer = Adam(sbert_model.parameters(), lr=1e-5)

        # 최종 모델 훈련
        num_epochs = 4  # 최종 학습용 에포크 수를 설정
        for epoch in range(num_epochs):
            sbert_model.train()
            total_loss = 0
            for batch in ged_dataLoader:
                anchor_texts, positive_texts, negative_texts, ged_aps, ged_ans = batch

                # 텍스트 토크나이즈 및 인코딩
                anchor_inputs = sbert_model.tokenizer(anchor_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
                positive_inputs = sbert_model.tokenizer(positive_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
                negative_inputs = sbert_model.tokenizer(negative_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)

                # SBERT의 forward 메서드를 사용하여 임베딩 얻기 (requires_grad=True 상태 유지)
                anchor_embeddings = sbert_model(anchor_inputs)['sentence_embedding']
                positive_embeddings = sbert_model(positive_inputs)['sentence_embedding']
                negative_embeddings = sbert_model(negative_inputs)['sentence_embedding']

                # GED 값을 텐서로 변환하여 장치로 이동
                ged_aps = torch.tensor(ged_aps, dtype=torch.float32).to(device)
                ged_ans = torch.tensor(ged_ans, dtype=torch.float32).to(device)

                # 손실 계산 및 최적화
                loss = criterion(anchor_embeddings, positive_embeddings, negative_embeddings, ged_aps, ged_ans)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # 손실 값 누적
                total_loss += loss.item()
            
            avg_loss = total_loss / len(ged_dataLoader)
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss}")

        torch.save(sbert_model.state_dict(), model_path)
        print(f"saved '{model_path}'")

Datasets loaded from datasets/triplets/base_GED_300k.pkl
Datasets loaded from datasets/triplets/exceptional_GED_300k.pkl
Datasets loaded from datasets/triplets/additional_GED_300k.pkl
'models/triplet/GED/10000/sbert_0.1.pth' 파일이 존재합니다.
Datasets loaded from datasets/triplets/base_GED_300k.pkl
Datasets loaded from datasets/triplets/exceptional_GED_300k.pkl
Datasets loaded from datasets/triplets/additional_GED_300k.pkl


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  ged_aps = torch.tensor(ged_aps, dtype=torch.float32).to(device)
  ged_ans = torch.tensor(ged_ans, dtype=torch.float32).to(device)


Epoch [1/4], Loss: 0.08127098804404959
Epoch [2/4], Loss: 0.07551383393928408


In [27]:
def embedding_sbert(tokenized_processes, model, device, batch_size=16):
    
    embeddings = []
    total_batches = (len(tokenized_processes) + batch_size - 1) // batch_size

    progress_bar = tqdm(total=total_batches, desc='Processing Batches', unit='batch')

    for start_idx in range(0, len(tokenized_processes), batch_size):
        end_idx = min(start_idx + batch_size, len(tokenized_processes))
        batch_tokens = tokenized_processes[start_idx:end_idx]
        batch_texts = [" ".join(tokens) for tokens in batch_tokens]
        embedding = model.encode(batch_texts, device=device, show_progress_bar=False)
        embeddings.extend(embedding)
        
        progress_bar.update(1)

    progress_bar.close()

    return embeddings

def embedding_sbert_groups(tokenized_groups, model, device):
    labes = []
    embedded_group = []

    for idx, group in enumerate(tokenized_groups):
        embeddings = embedding_sbert(group, model, device)
        embedded_group.append(embeddings)
        labes.extend([idx] * len(embeddings))

    return embedded_group

In [None]:
result = pd.DataFrame()
excel_path = "result/sbert.xlsx"

for similarity in similarities:
    for size in sizes:
        model_path = f"models/triplet/{similarity}/{size}/sbert_{lambda_value}.pth"

        sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)
        sbert_model.load_state_dict(torch.load(model_path, map_location=device))
        
        embedded_group = embedding_sbert_groups(tokenized_groups, sbert_model, device)

        sim_matrix = cosine_avg(embedded_group)
        upper_triangular_values = sim_matrix[np.triu_indices(sim_matrix.shape[0])]
        result[f"{size}_{lambda_value}"] = upper_triangular_values

        # 결과 파일 저장
        result.to_excel(excel_path, index=False)

        # 메모리 캐시 정리
        torch.cuda.empty_cache()

  sbert_model.load_state_dict(torch.load(model_path, map_location=device))
Processing Batches: 100%|██████████| 625/625 [00:04<00:00, 135.82batch/s]
Processing Batches: 100%|██████████| 625/625 [00:04<00:00, 135.25batch/s]
Processing Batches: 100%|██████████| 625/625 [00:04<00:00, 136.25batch/s]
Processing Batches: 100%|██████████| 625/625 [00:05<00:00, 105.14batch/s]
Processing Batches: 100%|██████████| 625/625 [00:05<00:00, 105.63batch/s]
Processing Batches: 100%|██████████| 625/625 [00:05<00:00, 105.42batch/s]


Similarity Matrix:
[[0.89056873 0.79436409 0.42394969 0.46762395 0.38591737 0.39168537]
 [0.         0.85286719 0.48204541 0.44736141 0.42740169 0.44041091]
 [0.         0.         0.77514237 0.41202009 0.41406024 0.50699753]
 [0.         0.         0.         0.87453431 0.77477473 0.55000061]
 [0.         0.         0.         0.         0.87842309 0.54353094]
 [0.         0.         0.         0.         0.         0.84269792]]


: 

In [None]:
lambda_values = [0.1, 0.2, 0.3, 0.4, 0.5]
data_sizes = [20000, 30000, 40000, 50000, 100000]
excel_path = "result/t5.xlsx"
result = pd.DataFrame()
models = ['bert-base-uncased', 'facebook/bart-base', 't5-small', 'distilbert-base-uncased']
# 각 lambda에 따른 모델 경로
model_path = f"models/triplet/10000/t5-small_0.1.pth"
bart_model = AutoModel.from_pretrained("t5-small").to(device)
bart_tokenizer = AutoTokenizer.from_pretrained("t5-small")

# 모델 가중치 로드
bart_model.load_state_dict(torch.load(model_path, map_location=device))
bart_model.eval()  # 평가 모드 전환
    
# 토크나이징이 끝난 입력 데이터(tokenized_groups)를 임베딩
embedded_group = embedding_groups(tokenized_groups, bart_model, bart_tokenizer)

# 코사인 유사도 계산
sim_matrix = cosine_avg(embedded_group)
upper_triangular_values = sim_matrix[np.triu_indices(sim_matrix.shape[0])]
result[f"10000_0.1"] = upper_triangular_values

# 결과 파일 저장
result.to_excel(excel_path, index=False)

  bart_model.load_state_dict(torch.load(model_path, map_location=device))


Similarity Matrix:
[[0.99997705 0.99968207 0.55333805 0.99967593 0.99928719 0.52614057]
 [0.         0.99954194 0.5535056  0.99938947 0.99922186 0.52625054]
 [0.         0.         0.48194835 0.55353689 0.55360228 0.46810117]
 [0.         0.         0.         0.99998802 0.99974322 0.52736914]
 [0.         0.         0.         0.         0.99962568 0.52741057]
 [0.         0.         0.         0.         0.         0.47039601]]
