In [3]:
import io
import warnings
from pathlib import Path

import pandas as pd

In [2]:
# Load the encoded dataset from a path relative to the working directory.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index -- confirm whether that column
# should be kept in the splits or dropped (e.g. by reading with index_col=0).
df = pd.read_csv(r'norm_IndovestDKG_encoded_indovest.csv')
# Display the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,subject_entity_id,relation_id,object_entity_id,temporal_id
0,0,0,10203,0
1,1,1,244,0
2,1,1,16606,0
3,1,1,16607,0
4,1,1,3573,0


In [5]:
def split_dataframe_temporal_by_ratio(df, temporal_column, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Split a DataFrame chronologically into train, validation and test parts.

    The rows are sorted in ascending order of ``temporal_column`` and then cut
    by position: the first ``train_ratio`` share becomes the training set, the
    next ``val_ratio`` share the validation set, and every remaining row the
    test set.

    Args:
        df: Source DataFrame. It is not modified.
        temporal_column: Column (label) to sort by before splitting.
        train_ratio: Share of rows for the training set, between 0 and 1.
        val_ratio: Share of rows for the validation set, between 0 and 1.
        test_ratio: Share of rows for the test set, between 0 and 1. It is only
            validated: the test set always receives the rows left over after
            the training and validation sets, so that no row is dropped.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of slices of the sorted frame,
        which carries a fresh 0..n-1 index.

    Raises:
        ValueError: If any ratio lies outside the range [0, 1].
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if not all(0 <= ratio <= 1 for ratio in ratios):
        raise ValueError("Rasio (train_ratio, val_ratio, test_ratio) harus antara 0 dan 1.")
    # A sum above 1 is tolerated (with a small float tolerance): only a warning is printed.
    if sum(ratios) > 1.001:
        print("Peringatan: Jumlah rasio melebihi 1.0. Set test akan disesuaikan untuk memastikan total 100%.")

    # Use a stable sort: the default quicksort may reorder rows that share the
    # same temporal value, which would make the rows near a split boundary
    # depend on the sorting algorithm instead of on the original row order.
    df_sorted = df.sort_values(by=temporal_column, kind='mergesort').reset_index(drop=True)

    total_size = len(df_sorted)

    # int() truncates, so rounding leftovers end up in the test set.
    train_size = int(total_size * train_ratio)
    val_size = int(total_size * val_ratio)
    val_end = train_size + val_size

    train_df = df_sorted.iloc[:train_size]
    val_df = df_sorted.iloc[train_size:val_end]
    # Everything after the validation slice; empty when the first two slices
    # already cover (or exceed) the whole frame.
    test_df = df_sorted.iloc[val_end:]

    return train_df, val_df, test_df

# Shares of the chronologically sorted rows given to each split.
TRAIN_RATIO, VALID_RATIO, TEST_RATIO = 0.80, 0.10, 0.10

# Read the encoded dataset again inside this cell before splitting it.
df = pd.read_csv(r'norm_IndovestDKG_encoded_indovest.csv')

train_df, val_df, test_df = split_dataframe_temporal_by_ratio(
    df, 'temporal_id', TRAIN_RATIO, VALID_RATIO, TEST_RATIO
)

# Report the absolute size of every split and its share of the full dataset.
print(
    f"Total data dalam DataFrame awal: {len(df)}",
    "---",
    f"Training Set: {len(train_df)} ({len(train_df)/len(df)*100:.2f}%)",
    f"Validation Set: {len(val_df)} ({len(val_df)/len(df)*100:.2f}%)",
    f"Test Set: {len(test_df)} ({len(test_df)/len(df)*100:.2f}%)",
    "---",
    sep="\n",
)


Total data dalam DataFrame awal: 68207
---
Training Set: 54565 (80.00%)
Validation Set: 6820 (10.00%)
Test Set: 6822 (10.00%)
---


In [6]:
# Write each split to its own CSV file under 'splited/' (the spelling is kept
# because the conversion functions later in this notebook look for this folder).
# The folder is created first: DataFrame.to_csv does not create missing parent
# directories and would fail on a fresh checkout.
split_output_dir = Path('splited')
split_output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(split_output_dir / 'train.csv', index=False)
val_df.to_csv(split_output_dir / 'valid.csv', index=False)
test_df.to_csv(split_output_dir / 'test.csv', index=False)

# Formatting

In [None]:
import pandas as pd
import os
from pathlib import Path

# (name column, id column) pairs written to each mapping file, keyed by conversion type.
_MAPPING_COLUMNS = {
    'entity_mapping': ('entity_name', 'entity_id'),
    'relation_mapping': ('relation', 'relation_id'),
}


def _write_txt(input_file, output_file, conversion_type):
    """Convert one CSV file into its TXT counterpart.

    ``'space_separated'`` writes every column separated by a single space,
    without header or index. The mapping types write one ``name<TAB>id`` line
    per row using the columns listed in ``_MAPPING_COLUMNS``.
    """
    frame = pd.read_csv(input_file)
    if conversion_type == 'space_separated':
        frame.to_csv(output_file, sep=' ', index=False, header=False)
        return

    name_column, id_column = _MAPPING_COLUMNS[conversion_type]
    # Iterate the two columns directly rather than with DataFrame.iterrows():
    # iterrows builds one Series per row and does not preserve dtypes across
    # the row, so integer ids could be written as floats (e.g. "3.0").
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{name}\t{identifier}\n"
            for name, identifier in zip(frame[name_column], frame[id_column])
        )


def _convert_directory(base_dir, skip_missing=True):
    """Convert the split CSVs and the two mapping CSVs found under ``base_dir``.

    Output goes to ``<base_dir>/splited/final_txt_format``, which is created if
    needed. With ``skip_missing=True`` a missing input is skipped with a
    warning; with ``skip_missing=False`` reading it raises ``FileNotFoundError``.
    """
    base_dir = Path(base_dir)
    splited_dir = base_dir / 'splited'
    output_dir = splited_dir / 'final_txt_format'
    output_dir.mkdir(parents=True, exist_ok=True)

    conversions = [
        (splited_dir / 'train.csv', output_dir / 'train.txt', 'space_separated'),
        (splited_dir / 'valid.csv', output_dir / 'valid.txt', 'space_separated'),
        (splited_dir / 'test.csv', output_dir / 'test.txt', 'space_separated'),
        (base_dir / 'norm_IndovestDKG_encoded_entity_id.csv', output_dir / 'entity2id.txt', 'entity_mapping'),
        (base_dir / 'norm_IndovestDKG_relation_id.csv', output_dir / 'relation2id.txt', 'relation_mapping'),
    ]

    for input_file, output_file, conversion_type in conversions:
        if skip_missing and not input_file.exists():
            # Still best-effort, but no longer silent.
            warnings.warn(f"Skipping missing input file: {input_file}")
            continue
        _write_txt(input_file, output_file, conversion_type)


def convert_data_to_txt_format(base_dir=None):
    """Convert the data under a ``result`` directory to the TXT format.

    Args:
        base_dir: Directory holding ``splited/`` and the mapping CSVs. Defaults
            to ``../result`` relative to the working directory. The previous
            code built this as ``Path('.').parent / 'result'``, but the parent
            of ``Path('.')`` is ``Path('.')`` itself, so it actually pointed at
            ``./result``; pass ``Path('result')`` to get that location.

    Missing input files are skipped with a warning.
    """
    if base_dir is None:
        base_dir = Path('..') / 'result'
    _convert_directory(base_dir, skip_missing=True)


def auto_detect_and_convert():
    """Find the data directory automatically and convert it to the TXT format.

    The first candidate that contains ``splited/train.csv`` is used. If none
    does, a warning is issued and nothing is written.
    """
    # The first three entries are the locations the previous code effectively
    # checked ('.', 'experiments/result' and, because Path('.').parent is
    # Path('.'), 'result'); they keep their order so existing layouts resolve
    # as before. The last two are the parent-relative locations that were
    # intended but never reached.
    candidate_bases = [
        Path('.'),
        Path('experiments') / 'result',
        Path('result'),
        Path('..'),
        Path('..') / 'result',
    ]

    base_dir = next(
        (candidate for candidate in candidate_bases
         if (candidate / 'splited' / 'train.csv').exists()),
        None,
    )
    if base_dir is None:
        warnings.warn("No directory containing 'splited/train.csv' was found; nothing was converted.")
        return

    _convert_directory(base_dir, skip_missing=True)


def quick_convert():
    """Convert the data under ``./experiments/result`` to the TXT format.

    Unlike the other two entry points this one is strict: a missing input file
    raises ``FileNotFoundError``. Mapping lines are written directly instead of
    through ``to_csv(..., quoting=3)``, which raised ``csv.Error`` for any name
    containing a comma.
    """
    _convert_directory(Path('./experiments/result'), skip_missing=False)

# Run the conversion against the first candidate directory that contains
# 'splited/train.csv'.
auto_detect_and_convert()