# 사전 작업

In [1]:
import json
import os

# Directories read from / written to by the preprocessing cells.
raw_dir_path = "/home/work/factchecking/PetQA/data/raw/"
interim_dir_path = "/home/work/factchecking/PetQA/data/interim/"
# os.listdir() returns names in arbitrary order. The files are merged in list
# order and q_id values are later assigned by position, so the list is sorted
# to make the merge (and therefore the ids) reproducible between runs.
file_list = sorted(f for f in os.listdir(raw_dir_path) if f.endswith('.json'))

6개의 raw 파일 -> 하나의 파일로 병합

In [2]:
# Concatenate the records of every raw JSON file, in file_list order, and
# store the combined list as a single JSON file.
merged_data_path = os.path.join(interim_dir_path, "merged_data.json")

merged_data = []
for file_name in file_list:
    with open(os.path.join(raw_dir_path, file_name), 'r', encoding='utf-8') as raw_file:
        merged_data += json.load(raw_file)

with open(merged_data_path, 'w', encoding='utf-8') as merged_file:
    json.dump(merged_data, merged_file, ensure_ascii=False, indent=2)

print(f"총 게시글의 수: {len(merged_data)}")

총 게시글의 수: 63376


피쳐 추출

In [5]:
import re
# Output list and skip counters filled in by the extraction loop.
extracted_data = []
neutral_data_count = null_data_count = 0
# Posts whose title equals this removal notice are skipped.
NULL_STR = "본 게시물은 개인정보노출 또는 서비스 운영원칙에 위배된 내용이 포함되어 삭제되었습니다."

URL_PATTERN = re.compile(r'https?://\S+')


def remove_urls(text: str) -> str:
    """Return *text* with every http/https URL deleted.

    A URL is ``http://`` or ``https://`` followed by the longest run of
    non-whitespace characters. Nothing is inserted in its place, so the
    whitespace around a removed URL is left as it was.
    """
    return re.sub(URL_PATTERN, '', text)

# Turn each raw post into a compact question record with its usable answers.
for post in merged_data:
    animal_type = post.get("label_str")
    if animal_type == "neutral":
        neutral_data_count += 1
        continue
    if post.get("제목") == NULL_STR:
        null_data_count += 1
        continue

    kept_answers = []
    new_item = {
        "title": remove_urls(post.get("제목", "")),
        "content": remove_urls(post.get("본문", "")),
        "answers": kept_answers,
        "question_date": post.get("question_date"),
        "animal_type": animal_type,
        "link": post.get("link"),
    }

    # a_id is the answer's position in the raw list, so skipped answers leave
    # gaps in the numbering.
    for a_id, answer in enumerate(post["answers"]):
        if answer.get("답변") == "":
            continue

        is_vet = answer.get("expert_badge") == "수의사"
        # NOTE(review): emptiness is tested before URL removal, so an answer
        # made up only of a URL is kept here with "answer" == "".
        kept_answers.append({
            "a_id": a_id,
            "answer_type": "expert" if is_vet else "nonexpert",
            "answer": remove_urls(answer.get("답변", "")),
            "selected": answer.get("selected"),
            "name": answer.get("name"),
            "badge": answer.get("badge"),
            "answer_date": answer.get("answer_date"),
        })

    # Questions left without any answer are dropped.
    if kept_answers:
        extracted_data.append(new_item)
        
# Keep the first record for each (title, content) pair and number the
# survivors consecutively from 0.
seen_titles_contents = set()
duplicated_data = 0
q_id_count = 0
unique_data = []

for item in extracted_data:
    identifier = (item['title'], item['content'])
    if identifier in seen_titles_contents:
        duplicated_data += 1
        continue
    seen_titles_contents.add(identifier)
    # "q_id" goes first so it leads the key order in the written JSON.
    unique_data.append({"q_id": q_id_count, **item})
    q_id_count += 1

# Save the de-duplicated questions and report what was filtered out.
output_path = os.path.join(interim_dir_path, "preprocessed_data.json")
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(unique_data, out_file, ensure_ascii=False, indent=2)

summary_lines = (
    f"neutral data count: {neutral_data_count}",
    f"null data count: {null_data_count}",
    f"duplicated data count: {duplicated_data}",
    f"file_path: {output_path}",
    f"Total data count: {len(unique_data):,}",
)
print("\n".join(summary_lines))

neutral data count: 1351
null data count: 7
duplicated data count: 193
file_path: /home/work/factchecking/PetQA/data/interim/preprocessed_data.json
Total data count: 61,825


In [5]:
import json
import re
import os

URL_PATTERN = re.compile(r'https?://\S+')


def remove_urls(text: str) -> str:
    """Return *text* with every ``http://`` / ``https://`` URL removed."""
    return URL_PATTERN.sub('', text)


# Text fields of a QA record that get their URLs stripped.
_URL_STRIPPED_FIELDS = (
    "title",
    "content",
    "answer",
    "preprocessed_question",
    "preprocessed_answer",
)


def preprocess_and_filter(data):
    """Strip URLs from each record and drop records whose answer became empty.

    Args:
        data: Iterable of dict records. Each must hold the text keys
            ``title``, ``content``, ``answer``, ``preprocessed_question`` and
            ``preprocessed_answer``.

    Returns:
        A new list of new dicts, in input order, holding only the keys
        ``q_id``, ``title``, ``content``, ``answer``, ``a_id``,
        ``answer_type``, ``question_date``, ``animal_type``,
        ``preprocessed_question`` and ``preprocessed_answer``. A record is
        kept only if both ``answer`` and ``preprocessed_answer`` still contain
        non-whitespace text after URL removal.

    Raises:
        KeyError: If a record lacks one of the text keys, or, for a record
            that is kept, one of the other copied keys.

    The input records are not modified. The stripped text is built in a
    separate mapping rather than written back into each ``item``.
    """
    cleaned_data = []
    for item in data:
        text = {field: remove_urls(item[field]) for field in _URL_STRIPPED_FIELDS}

        if not (text["answer"].strip() and text["preprocessed_answer"].strip()):
            continue

        cleaned_data.append({
            "q_id": item["q_id"],
            "title": text["title"],
            "content": text["content"],
            "answer": text["answer"],
            "a_id": item["a_id"],
            "answer_type": item["answer_type"],
            "question_date": item["question_date"],
            "animal_type": item["animal_type"],
            "preprocessed_question": text["preprocessed_question"],
            "preprocessed_answer": text["preprocessed_answer"],
        })
    return cleaned_data

train_path = "/home/work/factchecking/PetQA/data/processed/train.json"
validation_path = "/home/work/factchecking/PetQA/data/processed/validation.json"
test_path = "/home/work/factchecking/PetQA/data/processed/test.json"


def _load_split(path):
    """Read one split file and return the parsed JSON."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _save_split(path, records):
    """Overwrite *path* with *records* as indented, non-ASCII-escaped JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


# All three splits are loaded and filtered before any file is rewritten.
train_data = _load_split(train_path)
validation_data = _load_split(validation_path)
test_data = _load_split(test_path)

new_train_data = preprocess_and_filter(train_data)
new_validation_data = preprocess_and_filter(validation_data)
new_test_data = preprocess_and_filter(test_data)

# The cleaned records replace the original split files in place.
_save_split(train_path, new_train_data)
_save_split(validation_path, new_validation_data)
_save_split(test_path, new_test_data)

In [4]:

# 등급별 일반인 분포를 확인 목적
# merged_data -> extracted_data_w_badge.json
import json
# Posts whose title equals this removal notice are skipped.
NULL_STR = "본 게시물은 개인정보노출 또는 서비스 운영원칙에 위배된 내용이 포함되어 삭제되었습니다."
extracted_data = []

merged_data_file = "/home/work/factchecking/PetQA/data/interim/merged_data.json"
with open(merged_data_file, 'r', encoding='utf-8') as merged_file:
    merged_data = json.load(merged_file)

# Build one record per retained post. Texts are copied as-is (no URL removal)
# and each answer's "badge" is kept.
q_id_count = 0
for item in merged_data:
    if item.get("label_str") == "neutral" or item.get("제목") == NULL_STR:
        continue

    new_item = {
        "q_id": q_id_count,
        "title": item.get("제목"),
        "content": item.get("본문", ""),
        "answers": [],
        "question_date": item.get("question_date"),
        "animal_type": item.get("label_str"),
        "link": item.get("link"),
    }
    q_id_count += 1

    # a_id is the answer's position in the raw list; empty answers are skipped.
    new_item["answers"].extend(
        {
            "a_id": a_id,
            "answer_type": "expert" if answer.get("expert_badge") == "수의사" else "nonexpert",
            "answer": answer.get("답변"),
            "badge": answer.get("badge"),
            "selected": answer.get("selected"),
            "answer_date": answer.get("answer_date"),
        }
        for a_id, answer in enumerate(item["answers"])
        if answer.get("답변") != ""
    )
    extracted_data.append(new_item)
    
# Drop repeated questions and give the survivors consecutive q_id values.
#
# The "q_id" already stored in each item was handed out before
# de-duplication, so it advances for duplicates too. The previous
# `{"q_id": q_id_count}` + `update(item)` pair never replaced it, because
# update() overwrote the placeholder with the item's own id. The cell that
# writes preprocessed_data.json numbers questions after de-duplication and
# only when at least one answer is left, so ids are assigned the same way here.
# NOTE(review): this assumes the q_ids in cleaned_data.json (joined against
# this output in the split section) follow the preprocessed_data.json
# numbering; confirm. That cell also de-duplicates on URL-stripped
# title/content whereas the raw text is compared here, so verify that both
# produce the same set of questions.
seen_titles_contents = set()
duplicated_data = 0
q_id_count = 0
unique_data = []

for item in extracted_data:
    # A question without answers receives no id in preprocessed_data.json.
    if not item["answers"]:
        continue
    identifier = (item['title'], item['content'])
    if identifier in seen_titles_contents:
        duplicated_data += 1
        continue
    seen_titles_contents.add(identifier)
    # Rebuild the record with the new id first, dropping any stale "q_id".
    new_item = {"q_id": q_id_count}
    new_item.update((key, value) for key, value in item.items() if key != "q_id")
    unique_data.append(new_item)
    q_id_count += 1

print(f"샘플 수: {len(unique_data)}")
# with open("./extracted_data_w_badge.json", 'w', encoding='utf-8') as f:
#     json.dump(unique_data, f, ensure_ascii=False, indent=2)

샘플 수: 61825


# Train / Validation / Test Split

In [None]:
import json
def _read_json(path):
    """Return the parsed contents of the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Cleaned QA pairs.
file_path = "/home/work/factchecking/PetQA/data/interim/cleaned_data.json"
cleaned_data = _read_json(file_path)
print(f"전처리 후 # QA pairs: {len(cleaned_data):,}")

# Per-question records; each one has an "answers" list.
file_path = "/home/work/factchecking/PetQA/src/preprocessing/extracted_data_w_badge.json"
data = _read_json(file_path)
print(len(data))
print(data[0])

# Collect the questions that have a selected expert answer as well as a
# selected non-expert answer.
selected_all_answer_type_q_ids = []
for item in data:
    answers = item["answers"]
    selected_expert_flag = any(
        answer["answer_type"] == "expert" and answer["selected"] for answer in answers
    )
    selected_nonexpert_flag = any(
        answer["answer_type"] == "nonexpert" and answer["selected"] for answer in answers
    )
    if selected_expert_flag and selected_nonexpert_flag:
        selected_all_answer_type_q_ids.append(item["q_id"])
print(len(selected_all_answer_type_q_ids))

# q_id of every cleaned QA pair whose question is in the selected list, one
# entry per pair, so a q_id can repeat. The lookup goes through a set:
# testing membership in the list for every pair was quadratic.
selected_q_id_lookup = set(selected_all_answer_type_q_ids)
q_ids = [item["q_id"] for item in cleaned_data if item["q_id"] in selected_q_id_lookup]
print(len(q_ids))

import os
processed_dir = "/home/work/factchecking/PetQA/data/processed"
train_file = os.path.join(processed_dir, "train.json")
validation_file = os.path.join(processed_dir, "validation.json")
test_file = os.path.join(processed_dir, "test.json")


def _read_and_report(path):
    """Load the JSON split at *path*, print its length, and return it."""
    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)
        print(len(records))
    return records


train_data = _read_and_report(train_file)
validation_data = _read_and_report(validation_file)
test_data = _read_and_report(test_file)
    
# Remove every QA pair whose question is listed in q_ids from each split.
# q_ids is a list, so it is converted to a set once instead of being scanned
# for every record.
excluded_q_ids = set(q_ids)

new_train_data = [item for item in train_data if item["q_id"] not in excluded_q_ids]
new_validation_data = [item for item in validation_data if item["q_id"] not in excluded_q_ids]
new_test_data = [item for item in test_data if item["q_id"] not in excluded_q_ids]

print(len(new_train_data))
print(len(new_validation_data))
print(len(new_test_data))

# The filtered splits are written back under the same file names they were
# read from.
new_train_path = os.path.join(processed_dir, "train.json")
new_validation_path = os.path.join(processed_dir, "validation.json")
new_test_path = os.path.join(processed_dir, "test.json")

for split_path, split_records in (
    (new_train_path, new_train_data),
    (new_validation_path, new_validation_data),
    (new_test_path, new_test_data),
):
    with open(split_path, "w", encoding="utf-8") as f:
        json.dump(split_records, f, ensure_ascii=False, indent=2)

In [18]:
import os
import json
from sklearn.model_selection import train_test_split
from collections import Counter
cleaned_data_path = "/home/work/factchecking/PetQA/data/interim/cleaned_data.json"
output_dir = "/home/work/factchecking/PetQA/data/processed"

# Decode explicitly as UTF-8, like every other cell that reads or writes these
# JSON files. Without `encoding`, open() uses the locale's preferred encoding,
# which is not guaranteed to be UTF-8.
with open(cleaned_data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

total_count = len(data)
print(f"전체 데이터 크기: {total_count}")
print("-"*50)

# One "<animal_type>-<answer_type>" label per QA pair. The split below is
# stratified on these labels.
stratify_labels = [f"{item['animal_type']}-{item['answer_type']}" for item in data]

category_counts = Counter(stratify_labels)
print("원본 데이터 카테고리 분포:")
for category, count in category_counts.items():
    print(f"{category}: {count}개 ({count / total_count * 100:.1f}%)")
print("-"*50)

# Number of QA pairs held out of training (validation + test together).
HOLDOUT_SIZE = 20000

# Stage 1: separate the training set from the held-out pool, stratified on
# the combined label.
train_data, temp_data, train_labels, temp_labels = train_test_split(
    data,
    stratify_labels,
    train_size=total_count - HOLDOUT_SIZE,
    test_size=HOLDOUT_SIZE,
    stratify=stratify_labels,
    random_state=42,
)

# Stage 2: cut the held-out pool in half for validation and test, again
# stratified.
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

print("분할 결과:")
print(f"Train: {len(train_data)}개")
print(f"Validation: {len(val_data)}개")
print(f"Test: {len(test_data)}개")
print("-"*50)

전체 데이터 크기: 55393
--------------------------------------------------
원본 데이터 카테고리 분포:
cat-expert: 2289개 (4.1%)
dog-expert: 11535개 (20.8%)
cat-nonexpert: 16402개 (29.6%)
dog-nonexpert: 25167개 (45.4%)
--------------------------------------------------
분할 결과:
Train: 35393개
Validation: 10000개
Test: 10000개
--------------------------------------------------


In [19]:
def print_distribution(data_labels, dataset_name):
    """Print the count and percentage of each label in *data_labels*.

    Args:
        data_labels: Sized collection of hashable, sortable labels.
        dataset_name: Name shown in the header line.

    Output is a header, one line per category in sorted order, and a
    50-dash rule. Nothing is returned.
    """
    tally = Counter(data_labels)
    total = len(data_labels)
    print(f"{dataset_name} 카테고리 분포:")
    for category in sorted(tally):
        count = tally[category]
        share = count / total * 100
        print(f"{category}: {count}개 ({share:.1f}%)")
    print("-" * 50)

# Report the label distribution of each split, in train/validation/test order.
for labels, title in (
    (train_labels, "Train"),
    (val_labels, "Validation"),
    (test_labels, "Test"),
):
    print_distribution(labels, title)

Train 카테고리 분포:
cat-expert: 1463개 (4.1%)
cat-nonexpert: 10480개 (29.6%)
dog-expert: 7370개 (20.8%)
dog-nonexpert: 16080개 (45.4%)
--------------------------------------------------
Validation 카테고리 분포:
cat-expert: 413개 (4.1%)
cat-nonexpert: 2961개 (29.6%)
dog-expert: 2082개 (20.8%)
dog-nonexpert: 4544개 (45.4%)
--------------------------------------------------
Test 카테고리 분포:
cat-expert: 413개 (4.1%)
cat-nonexpert: 2961개 (29.6%)
dog-expert: 2083개 (20.8%)
dog-nonexpert: 4543개 (45.4%)
--------------------------------------------------


In [20]:
# Split name -> records. The name is also the stem of the output file.
datasets = dict(train=train_data, validation=val_data, test=test_data)

for split_name in datasets:
    output_path = os.path.join(output_dir, f"{split_name}.json")
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(datasets[split_name], out_file, ensure_ascii=False, indent=2)
    print(f"{split_name}.json 저장 완료: {output_path}")

train.json 저장 완료: /home/work/factchecking/PetQA/data/processed/train.json
validation.json 저장 완료: /home/work/factchecking/PetQA/data/processed/validation.json
test.json 저장 완료: /home/work/factchecking/PetQA/data/processed/test.json


In [21]:
# Check the distributions: the three splits put back together serve as the
# "Original" reference column.
all_data = [*train_data, *val_data, *test_data]
    
def get_category_distribution(data):
    """Return the fraction of records in each animal/answer-type category.

    Args:
        data: Sized collection of dicts with "animal_type" and "answer_type".

    Returns:
        Dict mapping "<animal_type>-<answer_type>" to its share of *data*
        (0..1), in order of first appearance. Empty input gives ``{}``.
    """
    tally = Counter(f"{item['animal_type']}-{item['answer_type']}" for item in data)
    total = len(data)
    distribution = {}
    for category, count in tally.items():
        distribution[category] = count / total
    return distribution

original_dist = get_category_distribution(all_data)
train_dist = get_category_distribution(train_data)
val_dist = get_category_distribution(val_data)
test_dist = get_category_distribution(test_data)

# One row per category, one column per dataset, shares to three decimals.
print(f"{'Category':<20} {'Original':<10} {'Train':<10} {'Val':<10} {'Test':<10}")
print("-" * 65)

for category in sorted(original_dist):
    shares = [
        dist[category]
        for dist in (original_dist, train_dist, val_dist, test_dist)
    ]
    print(f"{category:<20} " + "     ".join(f"{share:.3f}" for share in shares))


Category             Original   Train      Val        Test      
-----------------------------------------------------------------
cat-expert           0.041     0.041     0.041     0.041
cat-nonexpert        0.296     0.296     0.296     0.296
dog-expert           0.208     0.208     0.208     0.208
dog-nonexpert        0.454     0.454     0.454     0.454
