# 전처리

In [2]:
import json
from pathlib import Path
import os

# Input locations: the raw JSON file and the folder holding the question images.
img_folder = Path("/home/work/factchecking/PetQA/data/processed/question_images")
raw_data_path = "/home/work/factchecking/PetQA/data/raw/filtered_medical_consultation_expert.json"

# Parse the whole JSON file in one go (decoded as UTF-8).
data = json.loads(Path(raw_data_path).read_text(encoding="utf-8"))
print(f"총 게시글 수: {len(data):,}")
# Show which fields the first record carries.
print(data[0].keys())

# Entry names (not full paths) found directly inside the image folder.
img_list = os.listdir(img_folder)
print(f"총 이미지 수: {len(img_list):,}")
print(img_list[0])


총 게시글 수: 12,645
dict_keys(['제목', '본문', 'question_video', 'tag_list', 'link', 'question_photo', 'question_date', 'answers', 'html_path'])
총 이미지 수: 20,153
818092f7-27ac-4c52-b69a-ba9d722a21c0.jpg


In [3]:
from collections import Counter

# Number of entries in "question_photo" for every post, in post order.
img_counts = [len(post["question_photo"]) for post in data]

# Histogram: image count -> number of posts with that many images.
count_distribution = Counter(img_counts)
print("샘플별 이미지 개수 분포:")
for num_imgs in sorted(count_distribution):
    print(f"{num_imgs}개 이미지: {count_distribution[num_imgs]}개 샘플")

# The frequencies add back up to the number of posts that were counted.
total = sum(count_distribution.values())
print(f"총 샘플 수: {total}")

샘플별 이미지 개수 분포:
1개 이미지: 7705개 샘플
2개 이미지: 3261개 샘플
3개 이미지: 1069개 샘플
4개 이미지: 357개 샘플
5개 이미지: 122개 샘플
6개 이미지: 62개 샘플
7개 이미지: 32개 샘플
8개 이미지: 15개 샘플
9개 이미지: 6개 샘플
10개 이미지: 3개 샘플
11개 이미지: 1개 샘플
12개 이미지: 3개 샘플
13개 이미지: 2개 샘플
16개 이미지: 2개 샘플
18개 이미지: 1개 샘플
23개 이미지: 2개 샘플
24개 이미지: 1개 샘플
42개 이미지: 1개 샘플
총 샘플 수: 12645


In [4]:
import numpy as np

# Summary statistics of the number of images per question
img_counts_array = np.array(img_counts)

# Mean, smallest and largest per-question image count.
avg_imgs = img_counts_array.mean()
min_imgs = img_counts_array.min()
max_imgs = img_counts_array.max()

print(f"질문당 평균 이미지 수: {avg_imgs:.2f}")
print(f"질문당 최소 이미지 수: {min_imgs}")
print(f"질문당 최대 이미지 수: {max_imgs}")

질문당 평균 이미지 수: 1.62
질문당 최소 이미지 수: 1
질문당 최대 이미지 수: 42


In [5]:
from PIL import Image
import numpy as np

# Gather the (width, height) of every referenced image that exists on disk.
# Image names in "question_photo" are resolved as "<name>.jpg" inside img_folder;
# names without a matching file are skipped silently.
sizes = []
for item in data:
    for img_name in item["question_photo"]:
        img_path = img_folder / f"{img_name}.jpg"
        if not img_path.exists():
            continue
        with Image.open(img_path) as img:
            sizes.append(img.size)

# Print the statistics
widths = np.array([size[0] for size in sizes])
heights = np.array([size[1] for size in sizes])

print(f"이미지 수: {len(widths)}")
print(f"너비 평균: {np.mean(widths):.1f}, 표준편차: {np.std(widths):.1f}, 최소~최대: {np.min(widths)}~{np.max(widths)}")
print(f"높이 평균: {np.mean(heights):.1f}, 표준편차: {np.std(heights):.1f}, 최소~최대: {np.min(heights)}~{np.max(heights)}")


이미지 수: 20153
너비 평균: 722.6, 표준편차: 79.8, 최소~최대: 19~750
높이 평균: 874.1, 표준편차: 307.7, 최소~최대: 19~4496


In [6]:
# Filter veterinarian samples: keep posts that have at least one answer that is
# both badged "수의사" (veterinarian) and marked as selected.
selected_expert_data = []
non_selected_expert_cnt = 0
for post in data:
    # Only the first question photo is carried forward.
    first_image = post["question_photo"][0]
    post_title = post.get("제목", "")
    post_content = post.get("본문", "")

    kept_answers = []
    for a_id, answer in enumerate(post["answers"]):
        is_vet = answer.get("expert_badge") == "수의사"
        if is_vet and answer.get("selected"):
            # a_id is the answer's position within the post's answer list.
            kept_answers.append({"a_id": a_id, "answer": answer["답변"]})
        elif is_vet:
            # Veterinarian answer that was not selected: counted, then dropped.
            non_selected_expert_cnt += 1

    if kept_answers:
        selected_expert_data.append({
            "image": first_image,
            "title": post_title,
            "content": post_content,
            "answers": kept_answers,
        })

# Both percentages are relative to the total number of posts.
print(f"채택된 수의사 답변 수: {len(selected_expert_data):,} ({len(selected_expert_data) / len(data):.2%})")
print(f"미채택 수의사 답변 수: {non_selected_expert_cnt:,} ({non_selected_expert_cnt / len(data):.2%})")

채택된 수의사 답변 수: 11,263 (89.07%)
미채택 수의사 답변 수: 1,384 (10.95%)


In [7]:
# Count the posts that kept more than one selected veterinarian answer.
multi_expert_cnt = sum(1 for post in selected_expert_data if len(post["answers"]) > 1)

print(f"여러 수의사 답변 수: {multi_expert_cnt:,} ({multi_expert_cnt / len(selected_expert_data):.2%})")

여러 수의사 답변 수: 0 (0.00%)


In [None]:
# Counters for each reason a sample is dropped.
# neutral_data_count and null_data_count are never incremented in this cell
# (the neutral-label filter is disabled); the former is still printed below.
neutral_data_count = 0
null_data_count = 0
too_short_question_count = 0
too_short_answer_count = 0
deleted_cnt = 0  # samples whose image is an "http..." link (link not reachable), removed

new_data = []
# The loop variable is deliberately not called `id`: in a notebook that would
# rebind the builtin `id` for every later cell.
for sample_id, item in enumerate(selected_expert_data):
    # Question = title + " " + content; drop it when shorter than 20 characters.
    if len(item["title"] + " " + item["content"]) < 20:
        too_short_question_count += 1
        continue

    # Only the first kept answer is used. This length check also covers the
    # empty-answer case, so no separate `== ""` test is needed.
    answer_text = item["answers"][0]["answer"]
    if len(answer_text) < 20:
        too_short_answer_count += 1
        continue

    if item["image"].startswith("http"):
        deleted_cnt += 1
        continue

    new_data.append({
        # "id" is the sample's index in selected_expert_data, not in new_data.
        "id": sample_id,
        "image": item["image"],
        "title": item["title"],
        "content": item["content"],
        "answer": answer_text,
    })

print(f"len(question) < 20인 샘플 수: {too_short_question_count:,}")
print(f"len(answer) < 20인 샘플 수: {too_short_answer_count:,}")
print(f"삭제된 게시글 수: {deleted_cnt:,}")
print(f"개, 고양이가 아닌 샘플 수: {neutral_data_count:,}")
print(f"{len(new_data):,}")

len(question) < 20인 샘플 수: 29
삭제된 게시글 수: 196
개, 고양이가 아닌 샘플 수: 0
11,038


In [9]:
# Remove duplicates (identical posts): two samples are duplicates when their
# title, content and image all match. The first occurrence is the one kept.
deduplicated_data = []
seen_titles_contents = set()

for sample in new_data:
    identifier = (sample['title'], sample['content'], sample['image'])
    if identifier in seen_titles_contents:
        continue
    seen_titles_contents.add(identifier)
    deduplicated_data.append(sample)

print(f"중복된 샘플 수: {len(new_data) - len(deduplicated_data)}")
print(f"중복 제거 후 샘플 수: {len(deduplicated_data):,}")

중복된 샘플 수: 0
중복 제거 후 샘플 수: 11,038


In [17]:
import re

# Accumulators filled by the cleaning loop further down in this cell.
cleaned_data = []
short_answer_cnt = 0

# "http://" or "https://" followed by any run of non-whitespace characters.
URL_PATTERN = re.compile(r'https?://\S+')


def remove_urls(text: str) -> str:
    """Return ``text`` with every match of ``URL_PATTERN`` deleted.

    Whitespace around a removed URL is left untouched.
    """
    return re.sub(URL_PATTERN, '', text)

def remove_common_greetings(text):
    """Strip boilerplate greetings and sign-offs from a Korean text.

    Non-string input yields an empty string. Zero-width / BOM code points are
    removed first. Then each pattern below is deleted in order, stripping
    surrounding whitespace after every pass. The patterns are applied with
    ``re.MULTILINE``, so the line anchors match at every line, not only at the
    start and end of the whole text.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return ''

    # Remove invisible Unicode characters (zero-width spaces and the like)
    cleaned = re.sub(r'[\u200b\u200c\u200d\u2060\ufeff]', '', text)

    # Opening self-introduction first, then the closing phrases.
    greeting_patterns = (
        r'^안녕하세요.*?입니다\.?\s*',
        r'\s*감사합니다\.?\s*$',
        r'\s*고맙습니다\.?\s*$',
        r'\s*안녕히\s*계세요\.?\s*$',
        r'\s*안녕히\s*가세요\.?\s*$',
        r'\s*수고하세요\.?\s*$',
    )
    regex_flags = re.MULTILINE | re.UNICODE
    for greeting in greeting_patterns:
        cleaned = re.sub(greeting, '', cleaned, flags=regex_flags)
        cleaned = cleaned.strip()
    return cleaned

# Clean every sample; samples whose cleaned answer is shorter than 20
# characters are counted and left out.
for sample in deduplicated_data:
    answer = remove_urls(remove_common_greetings(sample["answer"]))
    if len(answer) >= 20:
        # Title and content get the same greeting + URL clean-up as the answer.
        cleaned_title = remove_urls(remove_common_greetings(sample["title"]))
        cleaned_content = remove_urls(remove_common_greetings(sample["content"]))
        cleaned_data.append({
            "id": sample["id"],
            "image": sample["image"],
            "title": cleaned_title,
            "content": cleaned_content,
            "answer": answer
        })
    else:
        short_answer_cnt += 1

# Persist the cleaned samples as human-readable UTF-8 JSON.
output_path = "/home/work/factchecking/PetQA/data/interim/preprocessed_multimodal_data.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

print(f"짧은 답변 수: {short_answer_cnt:,}")
print(f"최종 샘플 수: {len(cleaned_data):,}")


짧은 답변 수: 5
최종 샘플 수: 11,033


In [15]:
# Ids of the samples in cleaned_data whose answer is shorter than 20 characters.
short_answer = [sample["id"] for sample in cleaned_data if len(sample["answer"]) < 20]

print(f"짧은 답변 수: {len(short_answer):,}")
print(short_answer)

짧은 답변 수: 5
[1058, 2953, 6310, 6798, 9250]


# Train / Validation / Test Split

In [None]:
import json
import os


def _read_json(path):
    """Load and return the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _write_json(records, path):
    """Write ``records`` to ``path`` as indented UTF-8 JSON without ASCII escaping."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


def _without_q_ids(records, excluded_q_ids):
    """Return the records whose "q_id" is not in ``excluded_q_ids``, order preserved."""
    return [record for record in records if record["q_id"] not in excluded_q_ids]


file_path = "/home/work/factchecking/PetQA/data/interim/cleaned_data.json"
cleaned_data = _read_json(file_path)
print(f"전처리 후 # QA pairs: {len(cleaned_data):,}")

file_path = "/home/work/factchecking/PetQA/src/preprocessing/extracted_data_w_badge.json"
data = _read_json(file_path)
print(len(data))
print(data[0])

# Questions for which both an expert answer and a non-expert answer are selected.
selected_all_answer_type_q_ids = []
for item in data:
    selected_types = {
        answer["answer_type"] for answer in item["answers"] if answer["selected"]
    }
    if "expert" in selected_types and "nonexpert" in selected_types:
        selected_all_answer_type_q_ids.append(item["q_id"])
print(len(selected_all_answer_type_q_ids))

# Membership is tested once per sample below, so use sets instead of scanning
# a list each time. The lists are kept because their lengths are reported.
# Assumes q_id values are hashable (e.g. int or str) - TODO confirm.
selected_q_id_set = set(selected_all_answer_type_q_ids)
q_ids = [item["q_id"] for item in cleaned_data if item["q_id"] in selected_q_id_set]
print(len(q_ids))
excluded_q_ids = set(q_ids)

processed_dir = "/home/work/factchecking/PetQA/data/processed"
train_file = os.path.join(processed_dir, "train.json")
validation_file = os.path.join(processed_dir, "validation.json")
test_file = os.path.join(processed_dir, "test.json")

train_data = _read_json(train_file)
print(len(train_data))
validation_data = _read_json(validation_file)
print(len(validation_data))
test_data = _read_json(test_file)
print(len(test_data))

new_train_data = _without_q_ids(train_data, excluded_q_ids)
new_validation_data = _without_q_ids(validation_data, excluded_q_ids)
new_test_data = _without_q_ids(test_data, excluded_q_ids)

print(len(new_train_data))
print(len(new_validation_data))
print(len(new_test_data))

# NOTE: the filtered splits are written back to the same paths they were read
# from, i.e. the existing split files are overwritten in place.
new_train_path = os.path.join(processed_dir, "train.json")
_write_json(new_train_data, new_train_path)

new_validation_path = os.path.join(processed_dir, "validation.json")
_write_json(new_validation_data, new_validation_path)

new_test_path = os.path.join(processed_dir, "test.json")
_write_json(new_test_data, new_test_path)

In [18]:
import os
import json
from sklearn.model_selection import train_test_split
from collections import Counter
cleaned_data_path = "/home/work/factchecking/PetQA/data/interim/cleaned_data.json"
output_dir = "/home/work/factchecking/PetQA/data/processed"

HOLDOUT_SIZE = 20000  # validation + test samples combined (split in half below)
SPLIT_SEED = 42       # fixed seed so both splits are reproducible

# Read explicitly as UTF-8, like every other file in this notebook; without it
# the decoding falls back to the platform's locale encoding.
with open(cleaned_data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

total_count = len(data)
print(f"전체 데이터 크기: {total_count}")
print("-"*50)

# Stratification key per sample: "<animal_type>-<answer_type>".
stratify_labels = [f"{item['animal_type']}-{item['answer_type']}" for item in data]

category_counts = Counter(stratify_labels)
print("원본 데이터 카테고리 분포:")
for category, count in category_counts.items():
    percentage = (count / total_count) * 100
    print(f"{category}: {count}개 ({percentage:.1f}%)")
print("-"*50)

# Step 1: carve off HOLDOUT_SIZE samples (validation + test); the rest is train.
train_data, temp_data, train_labels, temp_labels = train_test_split(
    data,
    stratify_labels,
    test_size=HOLDOUT_SIZE,
    train_size=total_count - HOLDOUT_SIZE,
    stratify=stratify_labels,
    random_state=SPLIT_SEED,
)

# Step 2: split the held-out part in half into validation and test.
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=SPLIT_SEED,
)

print("분할 결과:")
print(f"Train: {len(train_data)}개")
print(f"Validation: {len(val_data)}개")
print(f"Test: {len(test_data)}개")
print("-"*50)

전체 데이터 크기: 55393
--------------------------------------------------
원본 데이터 카테고리 분포:
cat-expert: 2289개 (4.1%)
dog-expert: 11535개 (20.8%)
cat-nonexpert: 16402개 (29.6%)
dog-nonexpert: 25167개 (45.4%)
--------------------------------------------------
분할 결과:
Train: 35393개
Validation: 10000개
Test: 10000개
--------------------------------------------------


In [19]:
def print_distribution(data_labels, dataset_name):
    """Print how often each label occurs in ``data_labels``.

    Args:
        data_labels: Sized iterable of hashable, sortable labels.
        dataset_name: Name shown in the header line.

    Prints a header, one "label: count (percent)" line per distinct label in
    sorted label order, and a 50-dash separator. Returns nothing.
    """
    label_counts = Counter(data_labels)
    n_labels = len(data_labels)
    print(f"{dataset_name} 카테고리 분포:")
    for label in sorted(label_counts):
        occurrences = label_counts[label]
        share = (occurrences / n_labels) * 100
        print(f"{label}: {occurrences}개 ({share:.1f}%)")
    print("-"*50)

# Report the label distribution of each split, in train / validation / test order.
for split_labels, split_title in (
    (train_labels, "Train"),
    (val_labels, "Validation"),
    (test_labels, "Test"),
):
    print_distribution(split_labels, split_title)

Train 카테고리 분포:
cat-expert: 1463개 (4.1%)
cat-nonexpert: 10480개 (29.6%)
dog-expert: 7370개 (20.8%)
dog-nonexpert: 16080개 (45.4%)
--------------------------------------------------
Validation 카테고리 분포:
cat-expert: 413개 (4.1%)
cat-nonexpert: 2961개 (29.6%)
dog-expert: 2082개 (20.8%)
dog-nonexpert: 4544개 (45.4%)
--------------------------------------------------
Test 카테고리 분포:
cat-expert: 413개 (4.1%)
cat-nonexpert: 2961개 (29.6%)
dog-expert: 2083개 (20.8%)
dog-nonexpert: 4543개 (45.4%)
--------------------------------------------------


In [20]:
# Split name -> samples; the name is also used as the output file stem.
datasets = dict(train=train_data, validation=val_data, test=test_data)

# Write each split to "<output_dir>/<split_name>.json" as indented UTF-8 JSON.
for split_name in datasets:
    split_data = datasets[split_name]
    output_path = os.path.join(output_dir, f"{split_name}.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(split_data, f, ensure_ascii=False, indent=2)
    print(f"{split_name}.json 저장 완료: {output_path}")

train.json 저장 완료: /home/work/factchecking/PetQA/data/processed/train.json
val.json 저장 완료: /home/work/factchecking/PetQA/data/processed/val.json
test.json 저장 완료: /home/work/factchecking/PetQA/data/processed/test.json


In [21]:
# Check the distribution: recombine the three splits into a single list.
all_data = [*train_data, *val_data, *test_data]
    
def get_category_distribution(data):
    """Return the share of each "<animal_type>-<answer_type>" category.

    Args:
        data: Sized collection of dicts with "animal_type" and "answer_type" keys.

    Returns:
        Dict mapping category string to its fraction of ``len(data)``, in
        first-seen order. Empty input gives an empty dict.
    """
    n_items = len(data)
    tally = Counter(f"{item['animal_type']}-{item['answer_type']}" for item in data)
    distribution = {}
    for cat, count in tally.items():
        distribution[cat] = count / n_items
    return distribution

# Category shares for the recombined data (all_data) and for each split.
original_dist, train_dist, val_dist, test_dist = (
    get_category_distribution(subset)
    for subset in (all_data, train_data, val_data, test_data)
)

# Header row followed by a 65-dash rule.
print(f"{'Category':<20} {'Original':<10} {'Train':<10} {'Val':<10} {'Test':<10}")
print("-" * 65)

# One row per category, in sorted category order.
for category in sorted(original_dist):
    print(f"{category:<20} {original_dist[category]:.3f}     {train_dist[category]:.3f}     {val_dist[category]:.3f}     {test_dist[category]:.3f}")


Category             Original   Train      Val        Test      
-----------------------------------------------------------------
cat-expert           0.041     0.041     0.041     0.041
cat-nonexpert        0.296     0.296     0.296     0.296
dog-expert           0.208     0.208     0.208     0.208
dog-nonexpert        0.454     0.454     0.454     0.454
