# 사전 작업

In [2]:
import json
import os

# Directory holding the raw JSON files, and the directory that receives the
# intermediate artifacts written by the following cells.
raw_dir_path = "/home/work/factchecking/PetQA/data/raw/"
interim_dir_path = "/home/work/factchecking/PetQA/data/interim/"
# os.listdir() returns entries in arbitrary order; sort them so the merge
# order (and everything derived from sample order) is reproducible.
file_list = sorted(f for f in os.listdir(raw_dir_path) if f.endswith('.json'))

6개의 raw 파일 -> 하나의 파일로 병합

In [6]:
# Concatenate the samples of every raw JSON file into one list and persist
# it as merged_data.json in the interim directory.
merged_data_path = os.path.join(interim_dir_path, "merged_data.json")
merged_data = []
for file_name in file_list:
    file_path = os.path.join(raw_dir_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        # NOTE(review): extend() below presumes each file holds a top-level
        # JSON array of samples — confirm against the raw data.
        data = json.load(f)
    merged_data.extend(data)

# Make sure the interim directory exists so the write below does not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(interim_dir_path, exist_ok=True)
with open(merged_data_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(merged_data, f, ensure_ascii=False, indent=2)

print(f"파일 병합 완료: {merged_data_path}")
print(f"총 샘플의 수: {len(merged_data)}")

파일 병합 완료: /home/work/factchecking/PetQA/data/interim/merged_data.json
총 샘플의 수: 63376


피처 추출

In [10]:
# Reduce every merged sample to a fixed set of fields and write the result to
# extracted_data.json. Samples labelled "neutral" and samples whose title is
# the deleted-post notice are skipped and only counted.
extracted_data_path = os.path.join(interim_dir_path, "extracted_data.json")
extracted_data = []
neutral_data_count = 0
null_data_count = 0
# Title carried by removed posts: the text itself states that the post was
# deleted for exposing personal information or violating the operating rules.
NULL_STR = "본 게시물은 개인정보노출 또는 서비스 운영원칙에 위배된 내용이 포함되어 삭제되었습니다."

with open(merged_data_path, 'r', encoding='utf-8') as f:
    merged_data = json.load(f)

for item in merged_data:
    # label_str is stored as animal_type below; "neutral" samples are dropped.
    if item.get("label_str") == "neutral":
        neutral_data_count += 1
        continue
    if item.get("제목") == NULL_STR:
        null_data_count += 1
        continue

    new_item = {
        "title": item.get("제목"),
        "content": item.get("본문", ""),
        "answers": [],
        "question_date": item.get("question_date"),
        "animal_type": item.get("label_str"),
        "link": item.get("link")
    }

    # Every other field is read with .get(); do the same here so one record
    # with a missing or null "answers" field yields an empty answer list
    # instead of aborting the whole extraction with KeyError/TypeError.
    for a_id, answer in enumerate(item.get("answers") or []):
        # Only answers whose badge is "수의사" (veterinarian) count as expert.
        answer_type = "expert" if answer.get("expert_badge") == "수의사" else "nonexpert"
        new_answer = {
            "a_id": a_id,  # position of the answer within its question
            "answer_type": answer_type,
            "answer": answer.get("답변"),
            "selected": answer.get("selected"),
            "answer_date": answer.get("answer_date")
        }
        new_item["answers"].append(new_answer)
    extracted_data.append(new_item)

with open(extracted_data_path, 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, ensure_ascii=False, indent=2)

print(f"추출 완료: {extracted_data_path}")
print(f"중립 데이터 수: {neutral_data_count}")
print(f"삭제된 데이터 수: {null_data_count}")
print(f"추출된 샘플 수: {len(extracted_data)}")

추출 완료: /home/work/factchecking/PetQA/data/interim/extracted_data.json
중립 데이터 수: 1351
삭제된 데이터 수: 7
추출된 샘플 수: 62018


중복 제거

In [12]:
# Drop samples whose (title, content) pair was already seen, keep the first
# occurrence, and give each kept sample a sequential q_id as its first key.
unique_data_path = os.path.join(interim_dir_path, "unique_data.json")
with open(extracted_data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

seen_titles_contents = set()
unique_data = []
duplicated_data = 0

for item in data:
    identifier = (item['title'], item['content'])
    if identifier not in seen_titles_contents:
        seen_titles_contents.add(identifier)
        # The next q_id always equals the number of samples kept so far.
        new_item = {"q_id": len(unique_data), **item}
        unique_data.append(new_item)
        continue
    duplicated_data += 1

# Running id counter, left at the next unused q_id.
q_id_count = len(unique_data)

with open(unique_data_path, 'w', encoding='utf-8') as f:
    json.dump(unique_data, f, ensure_ascii=False, indent=2)

print(f"중복된 데이터: {duplicated_data}개")
print(f"고유한 샘플 수: {len(unique_data)}개")

중복된 데이터: 193개
고유한 샘플 수: 61825개


# Train / Validation / Test Split

In [18]:
import os
import json
from sklearn.model_selection import train_test_split
from collections import Counter
# Stratified train / validation / test split of the cleaned dataset.
# NOTE(review): cleaned_data.json is not written by any cell visible in this
# file, and its records are read with a top-level 'answer_type' key — confirm
# which step produces it.
cleaned_data_path = "/home/work/factchecking/PetQA/data/interim/cleaned_data.json"
output_dir = "/home/work/factchecking/PetQA/data/processed"

# Read explicitly as UTF-8, like every other file in this pipeline: the JSON
# is written with ensure_ascii=False, so relying on the locale's default
# encoding breaks on non-UTF-8 systems.
with open(cleaned_data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

total_count = len(data)
print(f"전체 데이터 크기: {total_count}")
print("-"*50)

# One "<animal_type>-<answer_type>" label per sample; used as the
# stratification key so every split keeps the same category mix.
stratify_labels = []
for item in data:
    category = f"{item['animal_type']}-{item['answer_type']}"
    stratify_labels.append(category)

category_counts = Counter(stratify_labels)
print("원본 데이터 카테고리 분포:")
for category, count in category_counts.items():
    percentage = (count / total_count) * 100
    print(f"{category}: {count}개 ({percentage:.1f}%)")
print("-"*50)

# First cut: hold out a fixed 20000 samples (validation + test together);
# the remainder is the training set.
train_data, temp_data, train_labels, temp_labels = train_test_split(
    data,
    stratify_labels,
    test_size=20000,  # val + test
    train_size=total_count - 20000,  # train
    stratify=stratify_labels,
    random_state=42
)

# Second cut: split the held-out part in half into validation and test,
# again stratified on the same labels.
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data,
    temp_labels,
    test_size=0.5,  # half of temp each
    stratify=temp_labels,
    random_state=42
)

print(f"분할 결과:")
print(f"Train: {len(train_data)}개")
print(f"Validation: {len(val_data)}개")
print(f"Test: {len(test_data)}개")
print("-"*50)

전체 데이터 크기: 55393
--------------------------------------------------
원본 데이터 카테고리 분포:
cat-expert: 2289개 (4.1%)
dog-expert: 11535개 (20.8%)
cat-nonexpert: 16402개 (29.6%)
dog-nonexpert: 25167개 (45.4%)
--------------------------------------------------
분할 결과:
Train: 35393개
Validation: 10000개
Test: 10000개
--------------------------------------------------


In [19]:
def print_distribution(data_labels, dataset_name):
    """Print the count and percentage of each category in a label list.

    Args:
        data_labels: Sized collection of category labels (one per sample).
        dataset_name: Name printed in the header line.

    Categories are printed in sorted order, followed by a 50-dash rule.
    """
    tally = Counter(data_labels)
    n_labels = len(data_labels)
    print(f"{dataset_name} 카테고리 분포:")
    for label in sorted(tally):
        hits = tally[label]
        share = (hits / n_labels) * 100
        print(f"{label}: {hits}개 ({share:.1f}%)")
    print("-"*50)

# Report the category mix of each split, in train / validation / test order.
for split_labels, split_title in (
    (train_labels, "Train"),
    (val_labels, "Validation"),
    (test_labels, "Test"),
):
    print_distribution(split_labels, split_title)

Train 카테고리 분포:
cat-expert: 1463개 (4.1%)
cat-nonexpert: 10480개 (29.6%)
dog-expert: 7370개 (20.8%)
dog-nonexpert: 16080개 (45.4%)
--------------------------------------------------
Validation 카테고리 분포:
cat-expert: 413개 (4.1%)
cat-nonexpert: 2961개 (29.6%)
dog-expert: 2082개 (20.8%)
dog-nonexpert: 4544개 (45.4%)
--------------------------------------------------
Test 카테고리 분포:
cat-expert: 413개 (4.1%)
cat-nonexpert: 2961개 (29.6%)
dog-expert: 2083개 (20.8%)
dog-nonexpert: 4543개 (45.4%)
--------------------------------------------------


In [20]:
# Write each split to "<split_name>.json" inside output_dir.
# NOTE(review): the recorded output of this cell lists val.json, but the
# 'validation' key below produces validation.json — confirm which file name
# downstream code expects.
datasets = {
    'train': train_data,
    'validation': val_data,
    'test': test_data
}

# Create the output directory if needed so the writes below do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for split_name, split_data in datasets.items():
    output_path = os.path.join(output_dir, f"{split_name}.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(split_data, f, ensure_ascii=False, indent=2)
    print(f"{split_name}.json 저장 완료: {output_path}")

train.json 저장 완료: /home/work/factchecking/PetQA/data/processed/train.json
val.json 저장 완료: /home/work/factchecking/PetQA/data/processed/val.json
test.json 저장 완료: /home/work/factchecking/PetQA/data/processed/test.json


In [21]:
# Distribution check: recombine the three splits into one list.
all_data = train_data + val_data + test_data
    
def get_category_distribution(data):
    """Return the share of items in each "<animal_type>-<answer_type>" category.

    Args:
        data: Sized collection of dicts with 'animal_type' and 'answer_type' keys.

    Returns:
        Dict mapping each category label to its fraction of ``len(data)``,
        in order of first appearance. Empty input gives an empty dict.
    """
    tally = Counter(f"{row['animal_type']}-{row['answer_type']}" for row in data)
    n_items = len(data)
    return {label: hits / n_items for label, hits in tally.items()}

# Category shares per split; the recombined splits fill the "Original" column.
original_dist = get_category_distribution(all_data)
train_dist, val_dist, test_dist = (
    get_category_distribution(split) for split in (train_data, val_data, test_data)
)

print(f"{'Category':<20} {'Original':<10} {'Train':<10} {'Val':<10} {'Test':<10}")
print("-" * 65)

# One row per category, in sorted order, with each share to three decimals.
for category in sorted(original_dist):
    shares = (
        original_dist[category],
        train_dist[category],
        val_dist[category],
        test_dist[category],
    )
    print(f"{category:<20} " + "     ".join(f"{share:.3f}" for share in shares))


Category             Original   Train      Val        Test      
-----------------------------------------------------------------
cat-expert           0.041     0.041     0.041     0.041
cat-nonexpert        0.296     0.296     0.296     0.296
dog-expert           0.208     0.208     0.208     0.208
dog-nonexpert        0.454     0.454     0.454     0.454
