# 환경설정

In [1]:
import numpy as np
import os
import glob
import torch

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import re

folder_path = "/content/drive/MyDrive/PMLAB/datasets_BPIC"
file_path_list = glob.glob(os.path.join(folder_path, "*.txt"))
result_lists = []

for file_path in file_path_list:
    result_list = []
    print(file_path)
    with open(file_path, 'r') as file:
        next(file)
        for line in file:
            pattern = r'"(.*?)"'
            matches = re.findall(pattern, line, flags=0)
            matches = str(matches[0]).replace("->", "\u2192")
            if matches:
                result_list.append(matches.replace('"', ""))

    result_lists.append(result_list)

BPIC15_1 = result_lists[0]
BPIC15_2 = result_lists[1]
BPIC15_3 = result_lists[2]
BPIC15_4 = result_lists[3]
BPIC15_5 = result_lists[4]

/content/drive/MyDrive/PMLAB/datasets_BPIC/1_metadata.txt
/content/drive/MyDrive/PMLAB/datasets_BPIC/2_metadata.txt
/content/drive/MyDrive/PMLAB/datasets_BPIC/3_metadata.txt
/content/drive/MyDrive/PMLAB/datasets_BPIC/4_metadata.txt
/content/drive/MyDrive/PMLAB/datasets_BPIC/5_metadata.txt


In [4]:
print(len(BPIC15_1),len(BPIC15_2),len(BPIC15_3),len(BPIC15_4),len(BPIC15_5))

143 129 129 123 129


In [5]:
groups = [BPIC15_1,BPIC15_2,BPIC15_3,BPIC15_4,BPIC15_5]

In [6]:
cleaned_groups = []
for group in groups:
    cleaned_group = []
    cleaned_BPIC15 = [item.replace("'", "") for item in group]
    cleaned_groups.append(cleaned_BPIC15)

# Node Matching

In [7]:
import itertools
import random

def generate_two_letter_labels():
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    labels = [''.join(pair) for pair in itertools.product(alphabet, repeat=2)]
    random.shuffle(labels)
    return labels

# 두 글자 알파벳 조합 생성
labels = generate_two_letter_labels()

In [8]:
def find_unique_node(groups):
    unique_nodes = set()
    for group in groups:
        for item in group:
            start_index = 0
            while True:
                start_quote = item.find("'", start_index)
                if start_quote == -1:
                    break
                end_quote = item.find("'", start_quote + 1)
                if end_quote == -1:
                    break
                node = item[start_quote + 1:end_quote]
                unique_nodes.add(node)
                start_index = end_quote + 1
    return unique_nodes

unique_nodes_list = []
unique_nodes_list = find_unique_node(groups)
unique_nodes_list = list(unique_nodes_list)

for node in unique_nodes_list:
    print(node)

print(len(unique_nodes_list))

creating decision permanently suspended
subcases completeness completed
retrieve missing data
inform BAG administrator
enter receipt of additional data
copy decision to extend procedure to stakeholders
no permit required for application
create subcases completeness
enter senddate decision procedure term extension
investigate BAG objects
treat subcases content
procedure change after completeness
decision date prior to decision
keep permanently suspended
generate letter no permit required
request complete
set decision phase no permit required decided
term 14 or 26 weeks
OLO messaging active
request to competent authority
MER required
phase forwarded to competent authority
applicant is stakeholder
terminate on request
phase archived case
application submitted through OLO
publish
registration date publication
read publication date field
stop all running subcases 2b
phase procedure prematurely terminated
create forwarding copy
send procedure confirmation
calculate provisional charges
regist

In [9]:
# 노드명과 알파벳 조합을 매칭할 딕셔너리 생성
node_to_label = {node: labels[i] for i, node in enumerate(unique_nodes_list)}

In [10]:
# 매칭 결과 출력
for node, label in node_to_label.items():
    print(f"{node} -> {label}")

creating decision permanently suspended -> KE
subcases completeness completed -> BZ
retrieve missing data -> WL
inform BAG administrator -> AC
enter receipt of additional data -> NX
copy decision to extend procedure to stakeholders -> XU
no permit required for application -> ZL
create subcases completeness -> BD
enter senddate decision procedure term extension -> SI
investigate BAG objects -> XS
treat subcases content -> RB
procedure change after completeness -> NY
decision date prior to decision -> LH
keep permanently suspended -> OQ
generate letter no permit required -> UF
request complete -> LX
set decision phase no permit required decided -> MK
term 14 or 26 weeks -> HD
OLO messaging active -> TM
request to competent authority -> SM
MER required -> AS
phase forwarded to competent authority -> GX
applicant is stakeholder -> JB
terminate on request -> DY
phase archived case -> DR
application submitted through OLO -> SQ
publish -> BF
registration date publication -> ET
read publicatio

In [11]:
# 실제 데이터를 사용하여 노드명을 알파벳 조합으로 교체하는 함수
def replace_nodes_with_labels(groups, node_to_label):
    replaced_groups = []
    for group in groups:
        replaced_group = []
        for item in group:
            replaced_item = item
            for node, label in node_to_label.items():
                replaced_item = re.sub(rf"'{re.escape(node)}'", f"'{label}'", replaced_item)
            replaced_item = replaced_item.replace('tau','0')
            replaced_group.append(replaced_item)
        replaced_groups.append(replaced_group)
    return replaced_groups

In [12]:
# 노드명을 알파벳 조합으로 교체
replaced_gropus = replace_nodes_with_labels(groups, node_to_label)

# Tokenizing

In [13]:
def tokenize_tree(tree):
    # 특수 문자를 이스케이프하여 정규 표현식 적용 + 괄호는 넣기
    special_characters = r"[\*\+\./:;<=\?\[\]\^_`{|}~\(\)]"
    tokenized_tree = re.findall(r"\b\w+\b|" + special_characters + r"|\u2192", str(tree))
    return tokenized_tree

# Tokenize each group
tokenized_1 = [tokenize_tree(tree) for tree in replaced_gropus[0]]
tokenized_2 = [tokenize_tree(tree) for tree in replaced_gropus[1]]
tokenized_3 = [tokenize_tree(tree) for tree in replaced_gropus[2]]
tokenized_4 = [tokenize_tree(tree) for tree in replaced_gropus[3]]
tokenized_5 = [tokenize_tree(tree) for tree in replaced_gropus[4]]

In [14]:
tokenized_all = []

# 토큰화된 트리들을 각 그룹에서 가져와서 통합
tokenized_all.extend(tokenized_1)
tokenized_all.extend(tokenized_2)
tokenized_all.extend(tokenized_3)
tokenized_all.extend(tokenized_4)
tokenized_all.extend(tokenized_5)
tokenized_group = [tokenized_1, tokenized_2, tokenized_3, tokenized_4, tokenized_5]

In [15]:
# 공통 어휘 구성
global_vocab = set()
for tokenized_processes in tokenized_all:
      global_vocab.update(tokenized_processes)
global_vocab = sorted(global_vocab)

print("vocab 길이 : ", len(global_vocab))
print("\n사용된 vocab : ", global_vocab)

vocab 길이 :  119

사용된 vocab :  ['(', ')', '*', '+', '0', 'AC', 'AD', 'AQ', 'AS', 'BD', 'BF', 'BZ', 'CT', 'CU', 'DM', 'DR', 'DY', 'EB', 'EG', 'ET', 'FF', 'FH', 'FI', 'GC', 'GT', 'GX', 'GZ', 'HD', 'HI', 'HS', 'HT', 'HW', 'HX', 'HY', 'IB', 'IE', 'IN', 'IO', 'IR', 'IX', 'JB', 'JC', 'JD', 'JG', 'JP', 'JV', 'KB', 'KD', 'KE', 'KS', 'KU', 'KZ', 'LH', 'LJ', 'LK', 'LU', 'LW', 'LX', 'LZ', 'MJ', 'MK', 'ML', 'MQ', 'NC', 'NX', 'NY', 'OC', 'OQ', 'OR', 'PC', 'PJ', 'PS', 'PT', 'PX', 'RA', 'RB', 'RE', 'RH', 'RY', 'SC', 'SD', 'SE', 'SH', 'SI', 'SM', 'SO', 'SQ', 'SR', 'SX', 'TB', 'TH', 'TM', 'UB', 'UD', 'UE', 'UF', 'UK', 'UP', 'VJ', 'VL', 'VO', 'VT', 'WL', 'WQ', 'X', 'XQ', 'XS', 'XU', 'XV', 'YI', 'YQ', 'YT', 'YX', 'ZH', 'ZK', 'ZL', 'ZP', 'ZS', '→']


# Bag-of-Words

In [None]:
def bag_of_words(tokenized_processes, global_vocab):
    vocab_list = sorted(list(global_vocab))
    bows = []

    for tokenized_process in tokenized_processes:
        process_vocab = {word: index for index, word in enumerate(vocab_list)}
        process_bow = np.zeros(len(vocab_list), dtype=int)

        for word in tokenized_process:
            word_index = process_vocab.get(word)
            process_bow[word_index] += 1

        bows.append(process_bow)

    return bows

In [None]:
bow_1, bow_2, bow_3, bow_4, bow_5 = [], [], [], [], []
for i, group in enumerate(tokenized_group):
    exec(f'bow_{i+1} = bag_of_words(group, global_vocab)')

# TF-IDF

In [None]:
len(replaced_gropus)

5

In [None]:
replaced_cleand_gropus = []
replaced_cleand_1 = [item.replace("'", "").replace(",", "") for item in replaced_gropus[0]]
replaced_cleand_2 = [item.replace("'", "").replace(",", "") for item in replaced_gropus[1]]
replaced_cleand_3 = [item.replace("'", "").replace(",", "") for item in replaced_gropus[2]]
replaced_cleand_4 = [item.replace("'", "").replace(",", "") for item in replaced_gropus[3]]
replaced_cleand_5 = [item.replace("'", "").replace(",", "") for item in replaced_gropus[4]]
replaced_cleand_gropus.extend([replaced_cleand_1, replaced_cleand_2, replaced_cleand_3, replaced_cleand_4, replaced_cleand_5])

In [None]:
replaced_cleand_all = []
for group in replaced_cleand_gropus:
    replaced_cleand_all.extend(group)

# 결과 출력
print(replaced_cleand_all)
print("길이 :",len(replaced_cleand_all))

['→( NA +( DI LP →( MM *( VH 0 ) ) ) LC X( 0 →( EP IM ) ) OE )', '→( NA DI LP MM *( VH 0 ) LC X( 0 →( EP IM ) ) OE )', '→( NA DI LP MM *( VH 0 ) LC EP IM OE )', '→( +( LP NA →( X( 0 EC ) +( X( 0 DI ) →( X( 0 →( BS X( 0 UB ) ) ) +( X( 0 FT ) →( X( 0 MM ) X( 0 EP ) X( 0 →( +( X( 0 OX ) SQ ) X( 0 →( +( LE FR ) +( MR CH ) ) ) ) ) X( 0 +( LC X( 0 KS ) →( +( *( VH 0 ) →( OE X( 0 →( X( 0 LZ ) X( 0 →( JX NU ) ) ) →( IO X( 0 HV IW MW ) ) ) X( 0 AA ) ) ) X( 0 PT ) ) ) ) ) ) ) ) ) ) X( 0 DS *( PI 0 ) MF ) )', '→( +( LP NA →( X( 0 EC ) X( 0 →( +( X( 0 OX ) SQ ) X( 0 →( +( LE FR ) +( MR CH ) ) ) ) ) +( LC X( 0 DI ) →( X( 0 →( BS X( 0 UB ) ) ) +( X( 0 FT ) →( X( 0 MM ) +( X( 0 KS ) →( +( *( VH 0 ) →( OE X( 0 →( X( 0 LZ ) X( 0 →( JX NU ) ) ) →( IO X( HV IW MW ) ) ) X( 0 AA ) ) ) X( 0 PT ) ) ) ) ) ) ) ) ) X( 0 DS *( PI 0 ) MF ) )', '→( +( LP →( X( 0 →( OX +( LE FR ) +( MR CH ) ) ) +( LC NA →( X( 0 EC ) X( 0 SQ ) +( X( 0 AA ) →( *( →( X( 0 FT ) X( 0 DI ) X( 0 MM ) +( *( VH 0 ) X( 0 →( OE X( 0 →( IO X( 

In [None]:
processtree_texts = [' '.join(tree) for tree in tokenized_all]

# 결과 출력
print(processtree_texts)
print("길이 :",len(processtree_texts))

['→ ( NA + ( DI LP → ( MM * ( VH 0 ) ) ) LC X ( 0 → ( EP IM ) ) OE )', '→ ( NA DI LP MM * ( VH 0 ) LC X ( 0 → ( EP IM ) ) OE )', '→ ( NA DI LP MM * ( VH 0 ) LC EP IM OE )', '→ ( + ( LP NA → ( X ( 0 EC ) + ( X ( 0 DI ) → ( X ( 0 → ( BS X ( 0 UB ) ) ) + ( X ( 0 FT ) → ( X ( 0 MM ) X ( 0 EP ) X ( 0 → ( + ( X ( 0 OX ) SQ ) X ( 0 → ( + ( LE FR ) + ( MR CH ) ) ) ) ) X ( 0 + ( LC X ( 0 KS ) → ( + ( * ( VH 0 ) → ( OE X ( 0 → ( X ( 0 LZ ) X ( 0 → ( JX NU ) ) ) → ( IO X ( 0 HV IW MW ) ) ) X ( 0 AA ) ) ) X ( 0 PT ) ) ) ) ) ) ) ) ) ) X ( 0 DS * ( PI 0 ) MF ) )', '→ ( + ( LP NA → ( X ( 0 EC ) X ( 0 → ( + ( X ( 0 OX ) SQ ) X ( 0 → ( + ( LE FR ) + ( MR CH ) ) ) ) ) + ( LC X ( 0 DI ) → ( X ( 0 → ( BS X ( 0 UB ) ) ) + ( X ( 0 FT ) → ( X ( 0 MM ) + ( X ( 0 KS ) → ( + ( * ( VH 0 ) → ( OE X ( 0 → ( X ( 0 LZ ) X ( 0 → ( JX NU ) ) ) → ( IO X ( HV IW MW ) ) ) X ( 0 AA ) ) ) X ( 0 PT ) ) ) ) ) ) ) ) ) X ( 0 DS * ( PI 0 ) MF ) )', '→ ( + ( LP → ( X ( 0 → ( OX + ( LE FR ) + ( MR CH ) ) ) + ( LC NA → ( X ( 0 EC 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# TF-IDF 모델 초기화
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b|[\(\)\'\*\+,/:;<=?\[\]^_`{|}~]|\u2192", lowercase=False)

# TF-IDF 변환
tfidf_matrix = tfidf_vectorizer.fit_transform(processtree_texts)

# 어휘 사전 가져오기
feature_names = tfidf_vectorizer.get_feature_names_out()

# 각 프로세스 트리에 대한 TF-IDF 벡터를 리스트로 추출
sentence_embeddings_tfidf = tfidf_matrix.toarray()

# 각 그룹의 크기 정의
group_sizes = [45, 41, 43, 45, 45]

tfidf_1, tfidf_2, tfidf_3, tfidf_4, tfidf_5 = [], [], [], [], []

# 데이터를 그룹 크기에 맞게 분할
start_idx = 0
for i, size in enumerate(group_sizes):
    exec(f'tfidf_{i+1} = sentence_embeddings_tfidf[start_idx:start_idx + size]')
    start_idx += size

# Word Embeddings

## 모델 정의

In [None]:
from gensim.models import Word2Vec

# Assuming 'tokenized_processtree' is a list of tokenized sentences
model_w2v = Word2Vec(sentences=tokenized_all, vector_size=64, window=4, workers=2, sg=0, epochs=100)

In [None]:
model_result = model_w2v.wv.most_similar("(")
print(model_result)

[('+', 0.5348647832870483), ('→', 0.5312145948410034), ('0', 0.4807100296020508), ('X', 0.4689379930496216), (')', 0.4623236060142517), ('NA', 0.42958569526672363), ('SQ', 0.37627583742141724), ('*', 0.36475831270217896), ('DI', 0.3583407402038574), ('KS', 0.31981268525123596)]


In [None]:
!pip install glove-python3

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 441, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 

In [None]:
from glove import Corpus, Glove

corpus = Corpus()

corpus.fit(tokenized_all, window=4)
model_glove = Glove(no_components=64, learning_rate=0.001)

model_glove.fit(corpus.matrix, epochs=100, no_threads=2, verbose=False)
model_glove.add_dictionary(corpus.dictionary)

In [None]:
print(model_glove.most_similar("("))

[(')', 0.9991187718216459), ('→', 0.9988327259554473), ('0', 0.9986878242916123), ('X', 0.9984320323604696)]


In [None]:
# word dict 생성
word_dict = {}
for word in  model_glove.dictionary.keys():
    word_dict[word] = model_glove.word_vectors[model_glove.dictionary[word]]
print('사용된 토큰 길이 : ', len(word_dict))

사용된 토큰 길이 :  119


In [None]:
from gensim.models import FastText

model_fasttext = FastText(sentences=tokenized_all, vector_size=64, window=4, workers=2, sg=0, epochs=100)

In [None]:
model_fasttext.wv.most_similar("(")

[(')', 0.4000214636325836),
 ('NA', 0.36268848180770874),
 ('0', 0.36235442757606506),
 ('SQ', 0.34771379828453064),
 ('DI', 0.344856321811676),
 ('→', 0.31985053420066833),
 ('+', 0.29683348536491394),
 ('KS', 0.2717045247554779),
 ('JQ', 0.26780301332473755),
 ('PT', 0.25666937232017517)]

In [None]:
def get_sentence_embeddings(tokenized_sentences, model):
    embeddings = []
    for sentence_tokens in tokenized_sentences:
        # Assuming you have a function named `sentence_embedding` for obtaining embeddings
        embedding = sentence_embedding(sentence_tokens, model)
        embeddings.append(embedding)
    return embeddings

## W2V

In [None]:
def sentence_embedding(sentence_tokens, model):
    # Initialize an empty array to store word vectors
    word_vectors = []

    # Iterate through the tokens in the sentence
    for token in sentence_tokens:
        if token in model.wv:
            # Keep the original 64-dimensional vector for each word
            word_vector = model.wv[token]
            word_vectors.append(word_vector)

    sentence_embedding = np.mean(word_vectors, axis=0, keepdims=False)
    return sentence_embedding

In [None]:
w2v_1 = get_sentence_embeddings(tokenized_1, model_w2v)
w2v_2 = get_sentence_embeddings(tokenized_2, model_w2v)
w2v_3 = get_sentence_embeddings(tokenized_3, model_w2v)
w2v_4 = get_sentence_embeddings(tokenized_4, model_w2v)
w2v_5 = get_sentence_embeddings(tokenized_5, model_w2v)

## glove

In [None]:
def sentence_embedding(tokens, model, embedding_dim=64):
    size = len(tokens)
    matrix = np.zeros((size, embedding_dim))
    word_table = word_dict
    for i, token in enumerate(tokens):
        vector = np.array([
            word_table[t] for t in token
            if t in word_table
        ])

        if vector.size != 0:
            final_vector = np.mean(vector, axis=0)
            matrix[i] = final_vector

    sentence_embedding = np.mean(matrix, axis=0)

    return sentence_embedding

In [None]:
glove_1 = get_sentence_embeddings(tokenized_1, model_w2v)
glove_2 = get_sentence_embeddings(tokenized_2, model_w2v)
glove_3 = get_sentence_embeddings(tokenized_3, model_w2v)
glove_4 = get_sentence_embeddings(tokenized_4, model_w2v)
glove_5 = get_sentence_embeddings(tokenized_5, model_w2v)

##fasttext

In [None]:
def sentence_embedding(sentence_tokens, model):
    word_vectors = []
    for token in sentence_tokens:
        if token in model.wv:
            # Keep the original 64-dimensional vector for each word
            word_vector = model.wv[token]
            word_vectors.append(word_vector)
    # Calculate the mean of word vectors along the first axis (axis=0)
    sentence_embedding = np.mean(word_vectors, axis=0)
    return sentence_embedding


In [None]:
fasttext_1 = get_sentence_embeddings(tokenized_1, model_fasttext)
fasttext_2 = get_sentence_embeddings(tokenized_2, model_fasttext)
fasttext_3 = get_sentence_embeddings(tokenized_3, model_fasttext)
fasttext_4 = get_sentence_embeddings(tokenized_4, model_fasttext)
fasttext_5 = get_sentence_embeddings(tokenized_5, model_fasttext)

## ELMO

In [None]:
pip install allennlp

Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cached_path-1.1.6-py3-none-any.whl (26 kB)
Collecting fairscale==0.4.6 (from allennlp)
  Downloading fairscale-0.4.6.tar.gz (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31m3

In [None]:
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 1, dropout=0)

In [None]:
# 함수 정의
def get_elmo_embedding(elmo, tokenized_data, device, batch_size=32):
    embeddings = []
    for i in range(0, len(tokenized_data), batch_size):
        batch = tokenized_data[i:i+batch_size]
        character_ids = batch_to_ids(batch).to(device)
        with torch.no_grad():
            emb_elmo = elmo(character_ids)
        elmo_representation = emb_elmo['elmo_representations'][0]
        elmo_avg = torch.mean(elmo_representation, dim=1)
        embeddings.append(elmo_avg.cpu().numpy())
    return np.vstack(embeddings)

In [None]:
elmo = elmo.to(device)

# 임베딩 얻기
elmo_1 = get_elmo_embedding(elmo, tokenized_1, device)
elmo_2 = get_elmo_embedding(elmo, tokenized_2, device)
elmo_3 = get_elmo_embedding(elmo, tokenized_3, device)
elmo_4 = get_elmo_embedding(elmo, tokenized_4, device)
elmo_5 = get_elmo_embedding(elmo, tokenized_5, device)

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Elmo 임베딩을 평균하여 1차원으로 축소
elmo_1 = np.mean(emb_elmo_1['elmo_representations'][0].detach().numpy(), axis=1)
elmo_2 = np.mean(emb_elmo_2['elmo_representations'][0].detach().numpy(), axis=1)
elmo_3 = np.mean(emb_elmo_3['elmo_representations'][0].detach().numpy(), axis=1)
elmo_4 = np.mean(emb_elmo_4['elmo_representations'][0].detach().numpy(), axis=1)
elmo_5 = np.mean(emb_elmo_5['elmo_representations'][0].detach().numpy(), axis=1)


# Feature Extraction with Pre-trained Model

## BERT

In [None]:
from transformers import BertTokenizer, BertModel
# BERT-cased 모델 로드
bert_model_name = 'bert-base-uncased' ## uncased가 더 좋은 결과
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name).to(device)

In [None]:
import torch
from transformers import LongformerModel, LongformerTokenizer
# Longformer 모델 및 토크나이저 불러오기
longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

def embed_processes_with_bert(tokenized_processes):

    # 임베딩을 저장할 리스트 초기화
    longformer_embeddings = []

    # tokenized_processes에 대한 반복문
    for tree_tokens in tokenized_processes:
        # Longformer 토크나이저로 토큰화하고 입력 형식에 맞게 변환
        inputs = longformer_tokenizer(tree_tokens, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        inputs = inputs.to(device)

        # Longformer 모델에 입력
        with torch.no_grad():
            outputs = longformer_model(**inputs)

        # [CLS] 토큰의 임베딩 사용
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().squeeze().numpy()

        longformer_embeddings.append(cls_embedding)

    # 모든 프로세스 트리에 대한 임베딩이 longformer_embeddings에 저장됨
    return longformer_embeddings

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
bert_embeddings_1 = []
bert_embeddings_2 = []
bert_embeddings_3 = []
bert_embeddings_4 = []
bert_embeddings_5 = []

for idx, group in enumerate(groups):
    bert_embeddings = embed_processes_with_bert(group)
    exec(f"bert_embeddings_{idx+1} = bert_embeddings")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

## GPT

In [None]:
import torch
from transformers import AutoTokenizer, GPTNeoModel

gpt_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
gpt_model =  GPTNeoModel.from_pretrained("EleutherAI/gpt-neo-1.3B").to(device)

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [None]:
def embed_processes_with_gpt(tokenized_processes):

    # 임베딩을 저장할 리스트 초기화
    gpt_embeddings = []

    # tokenized_processes에 대한 반복문
    for tree_tokens in tokenized_processes:
        tokens = gpt_tokenizer(tree_tokens, return_tensors='pt').to(device)  # 텐서로 변환

        # GPT 모델에 입력
        with torch.no_grad():
            output = gpt_model(**tokens)

        # 마지막 토큰의 임베딩 사용
        last_token_embedding = output.last_hidden_state[:, -1, :].cpu().squeeze().numpy()

        gpt_embeddings.append(last_token_embedding)

    # 모든 프로세스 트리에 대한 임베딩이 gpt_embeddings에 저장됨
    return gpt_embeddings

In [None]:
gpt_embeddings_1 = []
gpt_embeddings_2 = []
gpt_embeddings_3 = []
gpt_embeddings_4 = []
gpt_embeddings_5 = []

for idx, group in enumerate(groups):
    gpt_embeddings = embed_processes_with_gpt(group)
    exec(f"gpt_embeddings_{idx+1} = gpt_embeddings")

## SBERT

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2').to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def embed_processes_with_sbert(tokenized_processes):

    # 임베딩을 저장할 리스트 초기화
    sbert_embeddings = []

    # tokenized_processes에 대한 반복문
    for tree_tokens in tokenized_processes:

        outputs = sbert_model.encode(tree_tokens)

        sbert_embeddings.append(outputs)

    # 모든 프로세스 트리에 대한 임베딩이 sbert_embeddings에 저장됨
    return sbert_embeddings

In [None]:
sbert_embeddings_1 = []
sbert_embeddings_2 = []
sbert_embeddings_3 = []
sbert_embeddings_4 = []
sbert_embeddings_5 = []

for idx, group in enumerate(groups):
    sbert_embeddings = embed_processes_with_sbert(group)
    exec(f"sbert_embeddings_{idx+1} = sbert_embeddings")

## T5

In [None]:
from transformers import T5Tokenizer, T5Model
# T5 모델 로드
t5_model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5Model.from_pretrained(t5_model_name).to(device)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
def embed_processes_with_t5(tokenized_processes):

    # 임베딩을 저장할 리스트 초기화
    t5_embeddings = []

    # tokenized_processes에 대한 반복문
    for tree_tokens in tokenized_processes:
        # T5 모델에 입력
        inputs = t5_tokenizer(tree_tokens, return_tensors='pt', max_length=512).to(device)
        with torch.no_grad():
            output = t5_model(**inputs, decoder_input_ids=inputs["input_ids"])

        # 평균풀링 사용
        average_pooled_embedding = output.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()

        t5_embeddings.append(average_pooled_embedding)

    # 모든 프로세스 트리에 대한 임베딩이 t5_embeddings에 저장됨
    return t5_embeddings

In [None]:
t5_embeddings_1 = []
t5_embeddings_2 = []
t5_embeddings_3 = []
t5_embeddings_4 = []
t5_embeddings_5 = []

for idx, group in enumerate(groups):
    t5_embeddings = embed_processes_with_t5(group)
    exec(f"t5_embeddings_{idx+1} = t5_embeddings")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## XLNET

In [None]:
from transformers import XLNetTokenizer, XLNetModel
# XLNet 모델 로드
xlnet_model_name = 'xlnet-base-cased'
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model_name)
xlnet_model = XLNetModel.from_pretrained(xlnet_model_name).to(device)

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

In [None]:
def embed_processes_with_xlnet(tokenized_processes):

    # 임베딩을 저장할 리스트 초기화
    xlnet_embeddings = []

    # tokenized_processes에 대한 반복문
    for tree_tokens in tokenized_processes:
        tokens = xlnet_tokenizer(tree_tokens, return_tensors='pt').to(device)  # 텐서로 변환

        # XLNet 모델에 입력
        with torch.no_grad():
            output = xlnet_model(**tokens)

        # 평균풀링 사용
        average_pooled_embedding = output.last_hidden_state.mean(dim=1).cpu().squeeze().numpy()

        xlnet_embeddings.append(average_pooled_embedding)

    # 모든 프로세스 트리에 대한 임베딩이 xlnet_embeddings에 저장됨
    return xlnet_embeddings

In [None]:
xlnet_embeddings_1 = []
xlnet_embeddings_2 = []
xlnet_embeddings_3 = []
xlnet_embeddings_4 = []
xlnet_embeddings_5 = []

for idx, group in enumerate(groups):
    xlnet_embeddings = embed_processes_with_xlnet(group)
    exec(f"xlnet_embeddings_{idx+1} = xlnet_embeddings")

# Calculate Group Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def cosine_avg(embeddings_groups):
    num_groups = len(embeddings_groups)
    similarity_matrix = np.zeros((num_groups, num_groups))

    for i in range(num_groups):
        for j in range(i, num_groups):
            embeddings1 = embeddings_groups[i]
            embeddings2 = embeddings_groups[j]

            similarity_matrix[i, j] = np.mean(cosine_similarity(embeddings1, embeddings2))

    # Print the similarity matrix
    print("Similarity Matrix:")
    print(similarity_matrix)

## BOW

In [None]:
groups_bow = [bow_1,bow_2,bow_3,bow_4,bow_5]
cosine_avg(groups_bow)

Similarity Matrix:
[[0.97700098 0.96667922 0.97179616 0.9652603  0.96189776]
 [0.         0.96862104 0.96738201 0.96244334 0.96127382]
 [0.         0.         0.97195487 0.96446619 0.96240733]
 [0.         0.         0.         0.96197212 0.95757581]
 [0.         0.         0.         0.         0.95994281]]


## TFIDF

In [None]:
groups_tfidf = [tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5]
cosine_avg(groups_tfidf)

Similarity Matrix:
[[0.94288499 0.94973147 0.94912053 0.91999087 0.91922628]
 [0.         0.97197318 0.96953662 0.93203517 0.93354336]
 [0.         0.         0.97998397 0.93192352 0.93198126]
 [0.         0.         0.         0.93360023 0.92318553]
 [0.         0.         0.         0.         0.93898358]]


##W2V

In [None]:
groups_w2v = [w2v_1,w2v_2,w2v_3,w2v_4,w2v_5]
cosine_avg(groups_w2v)

Similarity Matrix:
[[0.97609621 0.95655447 0.9581852  0.95824218 0.95361567]
 [0.         0.95052135 0.9398393  0.94685698 0.9448725 ]
 [0.         0.         0.95141304 0.94621205 0.94248617]
 [0.         0.         0.         0.95194358 0.94709897]
 [0.         0.         0.         0.         0.94975102]]


## glove

In [None]:
groups_glove = [glove_1,glove_2,glove_3,glove_4,glove_5]
cosine_avg(groups_glove)

Similarity Matrix:
[[0.99999754 0.99999708 0.99999738 0.99999686 0.99999668]
 [0.         0.99999762 0.99999747 0.99999711 0.999997  ]
 [0.         0.         0.99999754 0.99999705 0.99999696]
 [0.         0.         0.         0.99999675 0.99999654]
 [0.         0.         0.         0.         0.99999659]]


## fasttext

In [None]:
groups_fasttext = [fasttext_1,fasttext_2,fasttext_3,fasttext_4,fasttext_5]
cosine_avg(groups_fasttext)

Similarity Matrix:
[[0.98109806 0.96639484 0.9683215  0.96567547 0.96381241]
 [0.         0.9634465  0.9556582  0.95879936 0.95817357]
 [0.         0.         0.96423036 0.95811093 0.95712245]
 [0.         0.         0.         0.9603368  0.95784849]
 [0.         0.         0.         0.         0.96105027]]


## ELMO

In [None]:
groups_elmo = [elmo_1,elmo_2,elmo_3,elmo_4,elmo_5]
cosine_avg(groups_elmo)

NameError: name 'elmo_1' is not defined

## BERT

In [None]:
embeddings_groups_bert = [bert_embeddings_1,bert_embeddings_2,bert_embeddings_3,bert_embeddings_4,bert_embeddings_5]
cosine_avg(embeddings_groups_bert)

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## GPT

In [None]:
embeddings_groups_gpt = [gpt_embeddings_1,gpt_embeddings_2,gpt_embeddings_3,gpt_embeddings_4,gpt_embeddings_5]
cosine_avg(embeddings_groups_gpt)

Similarity Matrix:
[[0.98155779 0.97685158 0.97765905 0.97807389 0.97296262]
 [0.         0.9795391  0.97847605 0.97785914 0.9711498 ]
 [0.         0.         0.98027295 0.97916967 0.97258568]
 [0.         0.         0.         0.98182791 0.97367275]
 [0.         0.         0.         0.         0.9694609 ]]


## SBERT

In [None]:
embeddings_groups_sbert = [sbert_embeddings_1,sbert_embeddings_2,sbert_embeddings_3,sbert_embeddings_4,sbert_embeddings_5]
cosine_avg(embeddings_groups_sbert)

Similarity Matrix:
[[0.92023325 0.86616898 0.90514165 0.88844919 0.86614048]
 [0.         0.8712036  0.87802964 0.87374747 0.85484117]
 [0.         0.         0.92020059 0.88793856 0.87704664]
 [0.         0.         0.         0.91882586 0.84585416]
 [0.         0.         0.         0.         0.87913978]]


## T5

In [None]:
embeddings_groups_t5 = [t5_embeddings_1,t5_embeddings_2,t5_embeddings_3,t5_embeddings_4,t5_embeddings_5]
cosine_avg(embeddings_groups_t5)

Similarity Matrix:
[[0.99786216 0.99771446 0.99752259 0.99775594 0.99777436]
 [0.         0.99827981 0.99797052 0.99783897 0.99798518]
 [0.         0.         0.9978534  0.99758571 0.99776858]
 [0.         0.         0.         0.99795431 0.99786109]
 [0.         0.         0.         0.         0.9980424 ]]


## XLNET

In [None]:
embeddings_groups_xlnet = [xlnet_embeddings_1,xlnet_embeddings_2,xlnet_embeddings_3,xlnet_embeddings_4,xlnet_embeddings_5]
cosine_avg(embeddings_groups_xlnet)

Similarity Matrix:
[[0.99975538 0.99962407 0.9996832  0.99973768 0.99964458]
 [0.         0.99966735 0.9996528  0.99964273 0.99957192]
 [0.         0.         0.99972546 0.99971879 0.999623  ]
 [0.         0.         0.         0.99981344 0.99967772]
 [0.         0.         0.         0.         0.99962115]]
