# Setup

In [1]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
import numpy as np

In [33]:
# Use the GPU when torch reports that CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
# Echo the selected device as the cell output.
device

device(type='cpu')

In [3]:
# Load the pretrained SBERT model by name and move it to the selected device.
sbert_model_name = 'distilbert-base-nli-mean-tokens'
sbert_model = SentenceTransformer(sbert_model_name).to(device)



In [21]:
import pickle
import os

def load_process_trees(filenames, base_dir='C:/Users/82105/Desktop/ProcessTree-Embedding/First Case/datasets/firstcase_10k_trees/firstcase_10k_trees'):
    """Unpickle one object per file name and return them in input order.

    Args:
        filenames: Iterable of file names located inside ``base_dir``.
        base_dir: Directory that contains the pickle files. The default is the
            location the notebook was originally written against; pass your
            own directory to run it on another machine.

    Returns:
        A list holding the unpickled content of each file, in the same order
        as ``filenames``.

    Raises:
        OSError: If a file cannot be opened.
    """
    process_trees_list = []
    for filename in filenames:
        path = os.path.join(base_dir, filename)
        with open(path, 'rb') as file:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # files that come from a trusted source.
            process_trees = pickle.load(file)
        process_trees_list.append(process_trees)
    return process_trees_list


In [22]:
# Names of the pickle files to load
filenames = [
    'processtree_naive.pkl',
    'processtree_simple.pkl',
    'processtree_concise.pkl',
    'processtree_straightforward.pkl',
    'processtree_complex.pkl',
    'processtree_spaghetti.pkl'
]
# Load the content of every file; the result keeps the order of `filenames`
processtree_list = load_process_trees(filenames)

In [25]:
# String form of every tree, keeping the per-file grouping of processtree_list.
processtree_texts = [[str(tree) for tree in group] for group in processtree_list]

In [26]:
import re

def tokenize_tree(tree):
    """Split the string form of a process tree into a flat list of tokens.

    Every "->" in ``str(tree)`` is first replaced by the single character
    U+2192 so that it comes out as one token. A token is then one of:
    a run of word characters delimited by word boundaries, one of the listed
    punctuation characters (brackets and parentheses included), or U+2192.
    Characters that match none of these alternatives are dropped.
    """
    punctuation_class = r"[\*\+\./:;<=\?\[\]\^_`{|}~\(\)]"
    token_pattern = r"\b\w+\b|" + punctuation_class + r"|\u2192"
    arrow_normalised = str(tree).replace("->", "\u2192")
    return re.findall(token_pattern, arrow_normalised)

# Tokenize every tree of every group in processtree_list into one flat list
tokenized_processtrees = [tokenize_tree(tree) for process_list in processtree_list for tree in process_list]
# Cut the flat list into consecutive chunks of chunk_size trees.
# NOTE(review): the original comment said "100 per chunk, 6 groups", but chunk_size is 10000;
# the number of chunks depends on how many trees were loaded — confirm.
chunk_size = 10000
tokenized_groups = [tokenized_processtrees[i:i+chunk_size] for i in range(0, len(tokenized_processtrees), chunk_size)]

In [27]:
# Re-join each token list into a single space-separated string.
all_process_trees = [" ".join(tree) for tree in tokenized_processtrees]

In [28]:
# Cut the joined tree strings into consecutive chunks of chunk_size items.
# NOTE(review): the original comment said "100 per chunk, 6 groups", but chunk_size is 10000 — confirm.
chunk_size = 10000
origin_groups = [all_process_trees[i:i+chunk_size] for i in range(0, len(all_process_trees), chunk_size)]

In [29]:
import torch
from torch.optim import Adam
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer

# Active Learning

In [30]:
def embed_processes_with_sbert(tokenized_processes, encoder=None):
    """Embed each process tree with a sentence encoder.

    Args:
        tokenized_processes: Iterable of process trees. Each item is either a
            sequence of string tokens (joined with single spaces before
            encoding) or an already-rendered string (encoded unchanged).
        encoder: Object exposing ``encode(list_of_str)`` that returns one
            vector per input string. Defaults to the notebook-level
            ``sbert_model``.

    Returns:
        A list with one embedding (NumPy array) per input tree, in input order.
    """
    if encoder is None:
        encoder = sbert_model

    # A plain string must not go through " ".join: joining a str inserts a
    # space between every character, so the encoder would see "- > ( a )".
    process_tree_texts = [
        tree_tokens if isinstance(tree_tokens, str) else " ".join(tree_tokens)
        for tree_tokens in tokenized_processes
    ]
    if not process_tree_texts:
        return []

    # One batched call replaces one encode() call per tree.
    sbert_embeddings = encoder.encode(process_tree_texts)
    return [np.asarray(embedding) for embedding in sbert_embeddings]

In [31]:
texts = []
clusters = []
# One slot per expected group; a slot stays empty when its group never shows up.
per_group_embeddings = [[] for _ in range(6)]

for idx, group in enumerate(processtree_texts):
    sbert_embeddings = embed_processes_with_sbert(group)
    # Keep the original texts next to their embeddings
    texts.extend(group)
    # Only the first six groups get a slot
    if idx < 6:
        per_group_embeddings[idx] = sbert_embeddings

    # One cluster label (the group index) per embedded tree
    clusters.extend([idx] * len(sbert_embeddings))

(
    sbert_embeddings_1,
    sbert_embeddings_2,
    sbert_embeddings_3,
    sbert_embeddings_4,
    sbert_embeddings_5,
    sbert_embeddings_6,
) = per_group_embeddings

# Concatenate the six per-group lists into one flat list
all_embeddings = sum(per_group_embeddings, [])

KeyboardInterrupt: 

In [None]:
import numpy as np

def select_diverse_samples(embeddings, clusters, num_samples_per_cluster=10, num_intercluster_samples=10):
    """Pick sample indices that lie far from their own cluster centre or close to another one.

    For every cluster label the centre is the mean of its members' embeddings.
    Two selections are made per cluster and merged:

    * the ``num_samples_per_cluster`` members farthest (Euclidean norm) from
      the cluster's own centre;
    * the ``num_intercluster_samples`` samples of *other* clusters that are
      closest to this cluster's centre.

    Args:
        embeddings: Sequence of embedding vectors, indexable by position.
        clusters: Sequence of cluster labels, aligned with ``embeddings``.
        num_samples_per_cluster: Size of the far-from-own-centre selection.
        num_intercluster_samples: Size of the near-foreign-centre selection.

    Returns:
        A list of unique sample indices (duplicates removed through a set).
    """
    total = len(embeddings)

    # Member positions and centre of every cluster label
    members_by_label = {}
    centre_by_label = {}
    for label in set(clusters):
        member_ids = [pos for pos in range(total) if clusters[pos] == label]
        members_by_label[label] = member_ids
        centre_by_label[label] = np.mean([embeddings[pos] for pos in member_ids], axis=0)

    picked = []

    # Within each cluster: members farthest from their own centre
    for label, centre in centre_by_label.items():
        ranked = sorted(
            ((pos, np.linalg.norm(embeddings[pos] - centre)) for pos in members_by_label[label]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        picked.extend(pos for pos, _ in ranked[:num_samples_per_cluster])

    # Across clusters: foreign samples closest to this cluster's centre
    for label, centre in centre_by_label.items():
        outsiders = [
            (pos, np.linalg.norm(embeddings[pos] - centre))
            for other_label in centre_by_label
            if label != other_label
            for pos in members_by_label[other_label]
        ]
        outsiders.sort(key=lambda pair: pair[1])
        picked.extend(pos for pos, _ in outsiders[:num_intercluster_samples])

    # Drop duplicates before returning
    return list(set(picked))


def combined_diversity(embeddings, clusters, num_samples_per_cluster=10, num_intercluster_samples=10):
    """Return a 0/1 diversity flag for every embedding.

    Args:
        embeddings: Sequence of embedding vectors.
        clusters: Sequence of cluster labels, aligned with ``embeddings``.
        num_samples_per_cluster: Forwarded to ``select_diverse_samples``.
        num_intercluster_samples: Forwarded to ``select_diverse_samples``.

    Returns:
        A list as long as ``embeddings`` whose i-th entry is 1 when index i was
        chosen by ``select_diverse_samples`` and 0 otherwise.
    """
    # A set makes each membership test O(1); testing against the returned
    # list would rescan it once per embedding.
    diverse_sample_indices = set(
        select_diverse_samples(embeddings, clusters, num_samples_per_cluster, num_intercluster_samples)
    )

    return [1 if i in diverse_sample_indices else 0 for i in range(len(embeddings))]

def sampling_diversity(embeddings, clusters, texts, num_samples_per_cluster=10, num_intercluster_samples=10):
    """Collect text, embedding and cluster label of every selected sample.

    The indices come from ``select_diverse_samples``; ``texts`` is indexed
    with the same positions as ``embeddings`` and ``clusters``.

    Returns:
        A dict mapping each selected index to a dict with the keys
        ``"text"``, ``"embedding"`` and ``"cluster_label"``.
    """
    chosen = select_diverse_samples(embeddings, clusters, num_samples_per_cluster, num_intercluster_samples)

    return {
        position: {
            "text": texts[position],
            "embedding": embeddings[position],
            "cluster_label": clusters[position],
        }
        for position in chosen
    }


In [None]:
# Select with num_samples_per_cluster=100 and num_intercluster_samples=100.
diverse_samples = sampling_diversity(all_embeddings, clusters, texts, 100, 100)

In [21]:
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.preprocessing import normalize
# from scipy.stats import entropy

# # 불확실성 점수를 계산하는 함수들
# def least_confidence(similarity):
#     return abs(similarity - 0.5)  # 0.5에 가까운 값이 가장 불확실

# def margin_sampling(similarities):
#     sorted_similarities = sorted(similarities, reverse=True)
#     return sorted_similarities[0] - sorted_similarities[1]  # 차이가 작은 쌍을 선택

# def calculate_entropy(similarities):
#     return entropy(similarities)

# def combined_uncertainty(similarities):
#     lc_scores = normalize([least_confidence(similarity) for similarity in similarities])
#     margin_scores = normalize([margin_sampling(similarities)])
#     entropy_scores = normalize([calculate_entropy(similarities)])
#     return [lc + margin + entropy for lc, margin, entropy in zip(lc_scores, margin_scores, entropy_scores)]

# def compute_similarity_matrix(embeddings):
#     """ 임베딩 간의 코사인 유사도 행렬 계산 """
#     return cosine_similarity(embeddings)

# def extract_similarities(similarity_matrix):
#     """ 유사성 행렬에서 상삼각 행렬의 값을 추출하여 리스트로 반환 """
#     num_samples = similarity_matrix.shape[0]
#     similarities = []
#     for i in range(num_samples):
#         for j in range(i + 1, num_samples):
#             similarities.append(similarity_matrix[i, j])
#     return similarities

# def sample_top_n(embeddings, uncertainty_scores, n=10):
#     """
#     상위 N개의 샘플을 선택합니다.

#     :param embeddings: 모든 임베딩 벡터의 리스트
#     :param uncertainty_scores: 각 임베딩 벡터의 불확실성 점수
#     :param n: 선택할 샘플의 수
#     :return: 선택된 샘플의 인덱스 리스트
#     """
#     # 불확실성 점수가 높은 상위 N개 인덱스 선택
#     top_n_indices = np.argsort(uncertainty_scores)[-n:]
#     return top_n_indices

# def create_sampled_dataset(embeddings, indices):
#     """
#     선택된 샘플 인덱스를 기반으로 데이터셋을 생성합니다.

#     :param embeddings: 모든 임베딩 벡터의 리스트
#     :param indices: 선택된 샘플의 인덱스 리스트
#     :return: 선택된 샘플의 리스트
#     """
#     sampled_data = [embeddings[i] for i in indices]
#     return sampled_data

# embeddings = np.array(all_embeddings)  # 임베딩을 np.array로 변환

# # 유사성 행렬 계산
# similarity_matrix = compute_similarity_matrix(embeddings)

# # 유사성 값 추출
# similarities = extract_similarities(similarity_matrix)

# # 불확실성 점수 계산
# uncertainty_scores = combined_uncertainty(similarities)

# GED similarity fine-tuning

In [1]:
pip install zss

Collecting zss
  Downloading zss-1.2.0.tar.gz (9.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: zss
  Building wheel for zss (setup.py) ... [?25l[?25hdone
  Created wheel for zss: filename=zss-1.2.0-py3-none-any.whl size=6726 sha256=12dac682f6a187e53c8d8c332dc9fa96ac1c3d6ed04c94b22647391a5d668022
  Stored in directory: /root/.cache/pip/wheels/f6/61/2a/cf33ab7301cc318a13418d9a805c1832be561b46e7d9337625
Successfully built zss
Installing collected packages: zss
Successfully installed zss-1.2.0


In [2]:
import networkx as nx
import matplotlib.pyplot as plt
from zss import simple_distance, distance
from zss import Node

class TreeNode:
    """Mutable n-ary tree node: a label plus an ordered list of child nodes."""

    def __init__(self, value):
        # value: the node label; children: child TreeNode objects in insertion order
        self.value = value
        self.children = []

    def add_child(self, child):
        """Append ``child`` as the last child of this node."""
        self.children.append(child)

    def print_tree(self, depth=0):
        """Print the subtree, one node per line, indented two spaces per level."""
        # str() keeps a non-string label (text_to_tree creates a None-valued
        # root) from raising TypeError during the concatenation.
        print("  " * depth + str(self.value))
        for child in self.children:
            child.print_tree(depth + 1)

    def to_zss_node(self):
        """Convert this subtree into a ``zss.Node`` tree with the same labels and child order."""
        # A leaf simply produces an empty child list, so it needs no special case.
        return Node(self.value, [child.to_zss_node() for child in self.children])

    def count_nodes(self):
        """Return the number of nodes in this subtree, this node included."""
        return 1 + sum(child.count_nodes() for child in self.children)

def text_to_tree(sentence):
    """Parse a parenthesised string into a ``TreeNode`` tree, one character per node.

    Spaces are removed first. Every character other than '(' and ')' becomes
    its own node, attached to the current parent. '(' makes the most recently
    created node the current parent; ')' returns to the enclosing parent, if
    there is one.

    NOTE(review): because parsing is per character, a multi-character label or
    operator (for example "->") is split into several single-character nodes,
    and separators such as ',' become nodes too — confirm this is intended.

    Args:
        sentence: Text to parse.

    Returns:
        The node that is the current parent when the text ends: the
        ``None``-valued placeholder root when no '(' was seen, otherwise the
        node left current after the last parenthesis (for "X(ab)" this is the
        "X" node).

    Raises:
        ValueError: If '(' appears before any node exists, or if ')' has no
            matching '('.
    """
    stack = []
    root = TreeNode(None)
    child = None  # most recently created node; the one a following '(' opens
    for char in sentence.replace(" ", ""):
        if char == '(':
            if child is None:
                # Previously surfaced as an UnboundLocalError.
                raise ValueError("'(' appears before any node it could open")
            stack.append(child)
            root = child
        elif char == ')':
            if not stack:
                # Previously surfaced as an IndexError from list.pop().
                raise ValueError("unmatched ')' in tree text")
            stack.pop()
            # When the stack empties, the just-closed node stays the current parent.
            if stack:
                root = stack[-1]
        else:
            child = TreeNode(char)
            root.add_child(child)
    return root

def tree_edit_distance(graph1, graph2):
    """Return ``zss.simple_distance`` between two trees.

    Both arguments must expose ``to_zss_node()``; the conversion happens here.
    """
    return simple_distance(graph1.to_zss_node(), graph2.to_zss_node())

def max_edit_distance(graph1, graph2):
    """Return the sum of the node counts of both trees.

    Both arguments must expose ``count_nodes()``. The caller uses this sum as
    the normaliser for the edit distance.
    """
    return graph1.count_nodes() + graph2.count_nodes()

def graph_edit_similarity(sentence1, sentence2):
    """Return a similarity score for two tree strings.

    Both strings are parsed with ``text_to_tree``. The score is
    ``1 - edit_distance / (node_count_1 + node_count_2)``, using
    ``tree_edit_distance`` and ``max_edit_distance``.
    """
    first_tree, second_tree = text_to_tree(sentence1), text_to_tree(sentence2)

    edit_cost = tree_edit_distance(first_tree, second_tree)
    normaliser = max_edit_distance(first_tree, second_tree)

    return 1 - (edit_cost / normaliser)

In [29]:
from tqdm import tqdm

def make_dataset(pts):
    """Build one labelled ``InputExample`` per unordered pair of tree strings.

    Every pair ``(pts[i], pts[j])`` with ``i <= j`` is produced, so each item
    is also paired with itself. The label is ``graph_edit_similarity`` of the
    pair.

    Args:
        pts: Sequence of tree strings.

    Returns:
        A list of ``len(pts) * (len(pts) + 1) // 2`` examples.
    """
    data = []
    total_iterations = len(pts) * (len(pts) + 1) // 2

    # The context manager closes the progress bar even when the loop is
    # interrupted or raises.
    with tqdm(total=total_iterations, desc='Progress', unit=' pairs') as progress_bar:
        for i in range(len(pts)):
            seq1 = pts[i]
            for j in range(i, len(pts)):
                seq2 = pts[j]
                similarity = graph_edit_similarity(seq1, seq2)
                data.append(InputExample(texts=[seq1, seq2], label=similarity))
                progress_bar.update(1)  # advance once per pair

    return data

In [30]:
# Extract only the "text" value of every entry in diverse_samples
texts_list = [item['text'] for item in diverse_samples.values()]

# Build the pairwise similarity dataset from those texts
datasets_diversity = make_dataset(texts_list)

Progress:   0%|          | 249/699153 [08:36<609:44:43,  3.14s/ pairs]

KeyboardInterrupt: 

In [None]:
# NOTE(review): load_files is not defined or imported in the visible cells, and this reads
# 'datasets.pkl' rather than using datasets_diversity built above — confirm where both come from.
datasets = load_files('datasets.pkl')

In [None]:
# Cast the label of every example to a NumPy float32 scalar; the examples are modified in place.
for i, example in enumerate(datasets):
    example.label = np.float32(example.label)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the dataset; the remaining 80% is the training set
train_data, temp_data = train_test_split(datasets, test_size=0.2, random_state=42)

# Split the held-out part in half, so test and eval each get about 10% of the full dataset
test_data, eval_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [None]:
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Training DataLoader: shuffled batches of 32 examples from train_data
train_dataloader = DataLoader(
    train_data,
    shuffle=True,
    batch_size=32,
)

# Evaluator built from eval_data; "sts-dev" is only the name passed to the evaluator
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    eval_data,
    name="sts-dev",
)

# Evaluator built from test_data; "sts-test" is only the name passed to the evaluator
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_data,
    name="sts-test",
)


In [None]:
from sentence_transformers import SentenceTransformer, models

# Load the transformer backbone that produces the token embeddings
# NOTE(review): do_lower_case=True is combined with a RoBERTa checkpoint; confirm lower-casing is intended for this tokenizer.
embedding_model = models.Transformer(
    model_name_or_path="deepset/roberta-base-squad2",
    max_seq_length=256,
    do_lower_case=True
)

# Mean pooling only: CLS-token and max pooling are switched off
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Chain backbone and pooling into one SentenceTransformer
model = SentenceTransformer(modules=[embedding_model, pooling_model])

Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use CosineSimilarityLoss
train_loss = losses.CosineSimilarityLoss(model=model)
model.to(device)

num_epochs = 4

# Warm up over 10% of the training steps that will actually run.
# len(train_dataloader) is the number of batches per epoch of the training
# split; the earlier formula counted the eval and test examples as well and
# repeated the epoch count and batch size as literals.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Training: evaluate on the dev evaluator every 10% of an epoch
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    warmup_steps=warmup_steps
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

Iteration:   0%|          | 0/46 [00:00<?, ?it/s]

In [None]:
def embed_processes_with_sbert(tokenized_processes, encoder=None):
    """Embed each process tree with a sentence encoder.

    This definition replaces any earlier function of the same name in the
    notebook namespace.

    Args:
        tokenized_processes: Iterable of process trees. Each item is either a
            sequence of string tokens (joined with single spaces before
            encoding) or an already-rendered string (encoded unchanged).
        encoder: Object exposing ``encode(list_of_str)`` that returns one
            vector per input string. Defaults to the notebook-level ``model``.

    Returns:
        A list with one embedding (NumPy array) per input tree, in input order.
    """
    if encoder is None:
        encoder = model

    # A plain string must not go through " ".join: joining a str inserts a
    # space between every character.
    process_tree_texts = [
        tree_tokens if isinstance(tree_tokens, str) else " ".join(tree_tokens)
        for tree_tokens in tokenized_processes
    ]
    if not process_tree_texts:
        return []

    # One batched call replaces one encode() call per tree.
    sbert_embeddings = encoder.encode(process_tree_texts)
    return [np.asarray(embedding) for embedding in sbert_embeddings]

In [None]:
# NOTE(review): the tokenized_processtree_* names are not defined in the visible cells — confirm where they come from.
groups = [tokenized_processtree_naive, tokenized_processtree_simple, tokenized_processtree_concise, tokenized_processtree_straightforward, tokenized_processtree_complex, tokenized_processtree_spaghetti]

# Embed the groups in order and collect the results position by position
ssl_embeddings_per_group = []
for idx, group in enumerate(groups):
    sbert_embeddings = embed_processes_with_sbert(group)
    ssl_embeddings_per_group.append(sbert_embeddings)

# Bind each group's result to its own name
(
    SSL_sbert_embeddings_naive,
    SSL_sbert_embeddings_simple,
    SSL_sbert_embeddings_concise,
    SSL_sbert_embeddings_straightforward,
    SSL_sbert_embeddings_complex,
    SSL_sbert_embeddings_spaghetti,
) = ssl_embeddings_per_group

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def cosine_avg(embeddings_groups):
    """Print the mean pairwise cosine similarity between every pair of groups.

    Entry (i, j) with ``j >= i`` is the mean over all values of
    ``cosine_similarity(embeddings_groups[i], embeddings_groups[j])``.
    Entries below the diagonal are printed as 0.

    Args:
        embeddings_groups: Sequence of groups, each one a collection of
            embedding vectors accepted by ``cosine_similarity``.

    Returns:
        None; the matrix is printed.
    """
    num_groups = len(embeddings_groups)
    similarity_matrix = np.zeros((num_groups, num_groups))

    for i in range(num_groups):
        # Only the upper triangle is shown, so the lower one is never computed.
        for j in range(i, num_groups):
            similarity_matrix[i, j] = np.mean(
                cosine_similarity(embeddings_groups[i], embeddings_groups[j])
            )

    # Print the (already upper-triangular) similarity matrix
    print(similarity_matrix)

In [None]:
# Gather the per-group embeddings in a fixed order and print their mean cosine similarities
embeddings_groups_sbert = [SSL_sbert_embeddings_naive, SSL_sbert_embeddings_simple, SSL_sbert_embeddings_concise, SSL_sbert_embeddings_straightforward, SSL_sbert_embeddings_complex, SSL_sbert_embeddings_spaghetti]
cosine_avg(embeddings_groups_sbert)
# 18000 items

[[0.9283703  0.88277519 0.82304925 0.76180714 0.75690877 0.73085809]
 [0.         0.88650751 0.84747577 0.73102343 0.7445876  0.73215872]
 [0.         0.         0.87726778 0.70326018 0.72004384 0.73560584]
 [0.         0.         0.         0.9750666  0.94939345 0.89744627]
 [0.         0.         0.         0.         0.95513743 0.92360681]
 [0.         0.         0.         0.         0.         0.95630032]]


In [None]:
# Gather the per-group embeddings in a fixed order and print their mean cosine similarities
embeddings_groups_sbert = [SSL_sbert_embeddings_naive, SSL_sbert_embeddings_simple, SSL_sbert_embeddings_concise, SSL_sbert_embeddings_straightforward, SSL_sbert_embeddings_complex, SSL_sbert_embeddings_spaghetti]
cosine_avg(embeddings_groups_sbert)
# 1800 items

[[0.98808557 0.94386983 0.80180001 0.78815114 0.77369833 0.60258859]
 [0.         0.95176208 0.87772673 0.75067461 0.76278567 0.65729588]
 [0.         0.         0.94869655 0.664976   0.70800954 0.73169202]
 [0.         0.         0.         0.98732239 0.96194553 0.82284719]
 [0.         0.         0.         0.         0.96386105 0.86725622]
 [0.         0.         0.         0.         0.         0.94519275]]
