In [1]:
# prompt: mount

# Mount Google Drive at /content/drive; the next cell changes into a folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/MyDrive/code/VTHNKG-NT/"
# ======= Set the location of the dataset to modify =======
# Every file name used in the later cells is relative, so it is resolved against this directory.

/content/drive/MyDrive/code/VTHNKG-NT


### 설정

In [3]:
# Install tqdm if not already installed
!pip install tqdm

import torch
from transformers import BertTokenizer, BertModel
import os
import numpy as np
import pandas as pd
from tqdm import tqdm



In [None]:
# ========== Settings ==========
# Existing dataset files to read (paths are relative to the working directory).
relations_txt = "relations.txt"
# NOTE(review): relation2id_txt is not referenced by any cell visible here.
relation2id_txt = "relation2id.txt"
relation2text_txt = "relation2text.txt"
relation2textlong_txt = "relation2textlong.txt"
numeric_value_file = "numeric_values.txt"
triplets_file = "triplets.txt"

# Relation types that are allowed (numeric relations to keep)
allowed_relations = [
    "lengthAverage", "weightAverage", "lifespan", "legAmount"
]

output_file = "triplet_all.txt" # name of the new triplet file
output_dir = "output/"  # directory where the new files are saved
os.makedirs(output_dir, exist_ok=True)

# Long description per relation (written directly in code)
relation_explanations = {
    "lengthAverage": "average length of the entity measured in meters",
    "weightAverage": "average weight of the entity measured in kilograms",
    "lifespan": "typical life of the entity measured in years",
    "legAmount": "number of legs the entity usually has"
}

# Initialize the BERT model and tokenizer, and move the model to the GPU when one is available
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Per-relation normalization settings
# min-max: (min, max)
# z-score: (mean, std)
relation_settings_minmax = {
    "lengthAverage": (0.5, 30),
    "weightAverage": (0.5, 1000),
    "lifespan": (1, 200),
    "legAmount": (1, 20)
}

relation_settings_zscore = {
    "lengthAverage": (15, 5),
    "weightAverage": (500, 200),
    "lifespan": (50, 30),
    "legAmount": (4, 2)
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

### triplets.txt에 numeric_values.txt의 numeric triplets들을 normalize 후 추가

In [None]:
# 1. Read the numeric value file
def load_numeric_values(file_path):
    """Read whitespace-separated (head, relation, value) rows into a DataFrame.

    The "value" column is converted to numbers; entries that cannot be parsed
    become NaN, and every row containing a NaN is dropped from the result.

    Args:
        file_path: Path of the header-less numeric value file.

    Returns:
        DataFrame with the columns "head", "relation" and "value".
    """
    column_names = ["head", "relation", "value"]
    raw = pd.read_csv(
        file_path,
        sep=r"\s+",
        header=None,
        names=column_names,
        engine="python",
    )
    parsed = raw.assign(value=pd.to_numeric(raw["value"], errors="coerce"))
    return parsed.dropna()

# 2. Keep only specific relations
def filter_relations(df, allowed_relations):
    """Return the rows of ``df`` whose "relation" value is in ``allowed_relations``."""
    keep_mask = df["relation"].isin(allowed_relations)
    return df.loc[keep_mask]

# 3. Min-Max Normalization
def min_max_normalize(series, min_val, max_val):
    """Clip values to [min_val, max_val] and rescale them linearly to [0, 1].

    Args:
        series: Numeric values (e.g. a pandas Series) to normalize.
        min_val: Lower bound; values below it are clipped and map to 0.
        max_val: Upper bound; values above it are clipped and map to 1.

    Returns:
        The normalized values, same container type as produced by ``np.clip``.

    Raises:
        ValueError: If ``max_val`` is not strictly greater than ``min_val``.
    """
    # A zero-width (or inverted) range would divide by zero / flip the scale and
    # silently put NaN or inf into the output, so reject it explicitly.
    if max_val <= min_val:
        raise ValueError(
            f"max_val ({max_val}) must be greater than min_val ({min_val})."
        )

    clipped = np.clip(series, min_val, max_val)
    normalized = (clipped - min_val) / (max_val - min_val)
    return normalized

# 4. Z-Score Normalization
def z_score_normalize(series, mean_val, std_val):
    """Standardize values as ``(series - mean_val) / std_val``.

    Args:
        series: Numeric values (e.g. a pandas Series) to normalize.
        mean_val: Mean to subtract.
        std_val: Standard deviation to divide by; must be non-zero.

    Returns:
        The standardized values.

    Raises:
        ValueError: If ``std_val`` is zero.
    """
    # Dividing by zero would silently produce inf/NaN values in the output.
    if std_val == 0:
        raise ValueError("std_val must be non-zero.")

    normalized = (series - mean_val) / std_val
    return normalized

# 5. Apply normalization per relation
def normalize_per_relation(df, relation_settings, method="min-max"):
    """Normalize the "value" column separately for every relation.

    Args:
        df: DataFrame with at least the columns "relation" and "value".
        relation_settings: Mapping from relation name to a 2-tuple:
            ``(min, max)`` for "min-max" or ``(mean, std)`` for "z-score".
            Relations without an entry are skipped (their rows are dropped).
        method: Either "min-max" or "z-score".

    Returns:
        A new DataFrame (fresh 0..n-1 index) containing the rows of all
        relations that had a setting, with "value" replaced by the
        normalized value. Rows are ordered relation group by relation group.

    Raises:
        ValueError: If ``method`` is unknown, or if no relation at all could
            be normalized.
    """
    # Validate the method before touching the data so that a typo is reported
    # as such even when no relation in ``df`` has a setting.
    if method == "min-max":
        normalize = min_max_normalize
    elif method == "z-score":
        normalize = z_score_normalize
    else:
        raise ValueError("method must be 'min-max' or 'z-score'.")

    normalized_triplets = []
    skipped_relations = []

    for relation, group in tqdm(df.groupby("relation")):
        setting = relation_settings.get(relation)
        if setting is None:
            skipped_relations.append(relation)
            continue  # relations without a setting are skipped

        # (min, max) for min-max, (mean, std) for z-score
        first_param, second_param = setting

        normalized_group = group.copy()
        normalized_group["value"] = normalize(group["value"], first_param, second_param)
        normalized_triplets.append(normalized_group)

    if not normalized_triplets:
        raise ValueError(f"No relations were normalized! Check if your relation_settings match your data. Skipped relations: {skipped_relations}")

    normalized_df = pd.concat(normalized_triplets, ignore_index=True)
    return normalized_df

# 6. Save triplets
def save_triplets(triplet_path, numeric_df, output_path, out_dir=None):
    """Write the existing triplets followed by the numeric triplets to one file.

    Args:
        triplet_path: Path of the existing triplet file; its lines are copied
            unchanged.
        numeric_df: DataFrame with the columns "head", "relation" and "value";
            each row is written as ``head<TAB>relation<TAB>value`` with the
            value formatted to six decimals.
        output_path: Name of the merged file, relative to ``out_dir``.
        out_dir: Directory to write into. When omitted, the notebook-level
            ``output_dir`` setting is used.
    """
    if out_dir is None:
        out_dir = output_dir

    with open(triplet_path, 'r', encoding='utf-8') as f:
        triplet_lines = f.readlines()

    # If the source file does not end with a newline, the first numeric triplet
    # would otherwise be glued onto the last existing triplet.
    if triplet_lines and not triplet_lines[-1].endswith('\n'):
        triplet_lines[-1] += '\n'

    numeric_lines = [
        f"{head}\t{relation}\t{value:.6f}\n"
        for head, relation, value in zip(
            numeric_df["head"], numeric_df["relation"], numeric_df["value"]
        )
    ]

    # os.path.join works whether or not out_dir ends with a path separator.
    with open(os.path.join(out_dir, output_path), 'w', encoding='utf-8') as f:
        f.writelines(triplet_lines)
        f.writelines(numeric_lines)

# ================ Usage ================

# Read the data
df = load_numeric_values(numeric_value_file)

# Filter relations
filtered_df = filter_relations(df, allowed_relations)

# Normalize (choose the method: "min-max" or "z-score")
# The settings dict must match the method: relation_settings_minmax holds (min, max) pairs.
normalized_df = normalize_per_relation(filtered_df, relation_settings_minmax, method="min-max")

# Merge with the triplets file and save
save_triplets(triplets_file, normalized_df, output_file)

100%|██████████| 4/4 [00:00<00:00, 404.81it/s]


### 새롭게 추가된 Numeric Relation으로 Relation 파일 (4개) 및 textual feature 수정

In [None]:
# Relations to add (this is the same list object as allowed_relations, not a copy)
new_relations = allowed_relations

In [None]:
# ========== Function definitions ==========

def load_existing_relations(relations_path):
    """Read relation names from a text file, one name per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    ignored so that a trailing empty line does not become an empty-string
    relation.

    Args:
        relations_path: Path of the relations file.

    Returns:
        List of relation names in file order.
    """
    with open(relations_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]

def load_existing_relation2text(file_path):
    """Read a ``relation<TAB>text`` file into a dict.

    Blank lines are ignored. Only the first tab separates the relation from
    its text, so a text that itself contains tabs is kept intact.

    Args:
        file_path: Path of the tab-separated mapping file.

    Returns:
        Dict mapping relation name to its text. When a relation occurs more
        than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    mapping = {}
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate empty / trailing blank lines

            rel, separator, text = stripped.partition('\t')
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'relation<TAB>text', got {stripped!r}"
                )
            mapping[rel] = text
    return mapping

def generate_textual_feature(text):
    """Embed ``text`` with the notebook-level tokenizer and model.

    The text is tokenized (with truncation and padding enabled), moved to
    ``device`` and run through ``model`` without gradient tracking. The last
    hidden state is averaged over dimension 1, squeezed and returned on the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().cpu()

def save_list(filepath, items):
    """Write every element of ``items`` to ``filepath``, one per line."""
    with open(filepath, 'w') as out:
        out.writelines(f"{entry}\n" for entry in items)

def save_mapping(filepath, mapping):
    """Write ``mapping`` to ``filepath`` as one ``key<TAB>value`` line per entry."""
    with open(filepath, 'w') as out:
        out.writelines(f"{name}\t{entry}\n" for name, entry in mapping.items())

# ========== Main flow ==========

# 1. Load the existing relations
existing_relations = load_existing_relations(relations_txt)
existing_relation2text = load_existing_relation2text(relation2text_txt)
existing_relation2textlong = load_existing_relation2text(relation2textlong_txt)

# 2. Add the new relations (the set removes duplicates)
total_relations = list(set(existing_relations + new_relations))
total_relations.sort()  # sort by name so the order, and therefore the IDs below, is deterministic

# 3. Build relation2id
# IDs are positions in the sorted list, so they are re-assigned from scratch here.
# NOTE(review): existing relations may therefore get different IDs than in the old relation2id.txt — confirm that is intended.
relation2id = {rel: idx for idx, rel in enumerate(total_relations)}

# 4. Build relation2text (a relation without an existing text falls back to its own name)
relation2text = {rel: existing_relation2text.get(rel, rel) for rel in total_relations}

# 5. Build relation2textlong (existing text first, then relation_explanations, then a placeholder)
relation2textlong = {rel: existing_relation2textlong.get(rel, relation_explanations.get(rel, "No description.")) for rel in total_relations}

# 6. Save to files (all under output_dir; the input files are not overwritten)
save_list(os.path.join(output_dir, "relations.txt"), total_relations)
save_mapping(os.path.join(output_dir, "relation2id.txt"), relation2id)
save_mapping(os.path.join(output_dir, "relation2text.txt"), relation2text)
save_mapping(os.path.join(output_dir, "relation2textlong.txt"), relation2textlong)

# 7. Update textual_features_rel.pt
relation_feat_file_old = "textual_features_rel.pt"
relation_feat_file_new = os.path.join(output_dir, "textual_features_rel.pt")

# Start from the existing features when the file is present, otherwise from an empty dict.
# NOTE(review): the loop below assumes the loaded object is a dict keyed by relation name — confirm the file format.
relation_features = torch.load(relation_feat_file_old) if os.path.exists(relation_feat_file_old) else {}

# Add BERT embeddings of the new relations' descriptions (relations already present are left untouched)
for rel in new_relations:
    if rel not in relation_features:
        desc = relation2textlong[rel]
        relation_features[rel] = generate_textual_feature(desc)

# Save
torch.save(relation_features, relation_feat_file_new)

print("✅ 모든 작업이 완료되었습니다.")


✅ 모든 작업이 완료되었습니다.


## triplet_all.txt 기반으로 train, valid, test 만들기

In [4]:
import random
# Seed the global RNG once so that random.shuffle in split_dataset produces a repeatable order.
random.seed(0)

def split_dataset(input_file, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, output_dir="."):
    """Shuffle the non-blank lines of ``input_file`` and split them into three files.

    The lines are shuffled with the global ``random`` state, then cut into
    train / valid / test parts and written to ``train.txt``, ``valid.txt``
    and ``test.txt`` inside ``output_dir``. The train and valid sizes are
    ``int(total * ratio)``; the test part receives all remaining lines.

    Args:
        input_file: Path of the UTF-8 text file to split (one sample per line).
        train_ratio: Fraction of lines for the training split.
        valid_ratio: Fraction of lines for the validation split.
        test_ratio: Fraction of lines for the test split; only used for the
            sanity check that the three ratios sum to 1.
        output_dir: Directory for the three output files; created if missing.
            Defaults to the current working directory.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-6

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    random.shuffle(lines)
    total = len(lines)

    train_end = int(total * train_ratio)
    valid_end = train_end + int(total * valid_ratio)

    train_lines = lines[:train_end]
    valid_lines = lines[train_end:valid_end]
    test_lines = lines[valid_end:]

    os.makedirs(output_dir, exist_ok=True)
    splits = (
        ("train.txt", train_lines),
        ("valid.txt", valid_lines),
        ("test.txt", test_lines),
    )
    for file_name, split_lines in splits:
        with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
            # One line per sample; an empty split yields an empty file rather
            # than a file holding a single blank line.
            f.writelines(f"{line}\n" for line in split_lines)

    print(f"🔹 train: {len(train_lines)}줄, valid: {len(valid_lines)}줄, test: {len(test_lines)}줄 저장 완료.")

# Named triplet_all_path rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
# NOTE(review): save_triplets writes triplet_all.txt under output_dir, while this reads
# it from the working directory — confirm which copy is meant to be split.
triplet_all_path = "triplet_all.txt"
split_dataset(triplet_all_path, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1)


🔹 train: 1286줄, valid: 160줄, test: 162줄 저장 완료.
