In [None]:
import pandas as pd
import re
from pathlib import Path

# Merge the pseudo-labelled base set with every augmentation CSV and save the
# combined training set to <base_path>/final_aug_train.csv.
base_path = Path("../data")
aug_path = Path("./augmentation_data")

# Only files whose name starts with "aug_" or "gemma_" are augmentation data.
# The earlier pattern `(aug_|gemma_|)` contained an empty alternative, which
# made the prefix optional and let every CSV in the folder through.
target_files = sorted(
    f for f in aug_path.glob("*.csv")
    if re.match(r"^(aug_|gemma_).*\.csv$", f.name)
)

print("🔍 탐색된 파일:")
for f in target_files:
    print(f" - {f.name}")

# Fail with a readable message instead of pandas' "No objects to concatenate".
if not target_files:
    raise FileNotFoundError(
        f"No aug_*.csv or gemma_*.csv files found in {aug_path}"
    )

base_df = pd.read_csv(base_path / "pseudo_labeling.csv")

# Rename the base columns to the names used by the rest of this script
# ('full_text' for the text, 'generated' for the label).
base_df = base_df.rename(columns={
    'paragraph': 'full_text',
    'paragraph_label': 'generated',
})

aug_dfs = [pd.read_csv(f) for f in target_files]
aug_df = pd.concat(aug_dfs, ignore_index=True)

# ignore_index=True already yields a fresh 0..n-1 index, so no reset is needed.
merged_df = pd.concat([base_df, aug_df], ignore_index=True)
merged_df.to_csv(base_path / "final_aug_train.csv", index=False)

def print_stats(df, title):
    """Print the row count of ``df`` and its per-label counts.

    Args:
        df: DataFrame with a 'generated' column holding the labels.
        title: Heading printed before the total row count.
    """
    def _as_label(value):
        return f"label {value}"

    # Count each label, order by label value, then prefix the index entries.
    per_label = df['generated'].value_counts().sort_index().rename(_as_label)

    print(f"{title}: 총 {len(df)}개")
    print(per_label)
    print()

# --- Output ---
# Report label distribution for the base set, the augmentation set, and the
# merged result, in that order.
print("=========================================")
for frame, heading in (
    (base_df, "기존 데이터"),
    (aug_df, "추가된 증강 데이터"),
    (merged_df, "병합 후 전체"),
):
    print_stats(frame, heading)


In [None]:
import pandas as pd
from pathlib import Path

# --- Path setup ---
# save_path is both the file loaded here and the destination of the later
# filtered write.
save_path = base_path / "final_aug_train.csv"

merged_df = pd.read_csv(save_path)

def print_length_stats(df, title):
    """Print the row count and mean / max / min character length of 'full_text'.

    Args:
        df: DataFrame with a string 'full_text' column.
        title: Heading printed on the first line.
    """
    char_counts = df["full_text"].str.len()
    mean_len = char_counts.mean()
    longest = char_counts.max()
    shortest = char_counts.min()

    print(f"{title}")
    print(f"데이터 개수       : {len(df)}")
    print(f"평균 text 길이    : {mean_len:.2f}")
    print(f"가장 긴 text 길이 : {longest}")
    print(f"가장 짧은 text 길이 : {shortest}\n")


print_length_stats(merged_df, "원본 merged_df 통계")

# Drop rows whose 'full_text' length lies in the inclusive range 5..29.
# A temporary 'text_length' column carries the lengths and is removed again.
# NOTE(review): rows shorter than 5 characters, and rows with missing text
# (NaN length), pass this mask and are kept — confirm that is intended.
filtered_df = (
    merged_df
    .assign(text_length=merged_df["full_text"].str.len())
    .loc[lambda frame: ~frame["text_length"].between(5, 29)]
    .drop(columns=["text_length"])
)

# Writes to save_path, i.e. over the CSV that merged_df was loaded from.
filtered_df.to_csv(save_path, index=False)

print("======================================")
print_length_stats(filtered_df, "필터링 후 통계")