Train 데이터를 Paragraph 단위로 자른 후 Knowledge base 가 큰 LLM 으로 relabeling 

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from tqdm import tqdm
import random

In [2]:
# Directory that holds the input train.csv and receives the pseudo-label CSV.
data_base = "../data"

In [None]:
# Load the training set; the labeling loop further down reads its
# "title", "full_text" and "generated" columns.
train_df = pd.read_csv(f"{data_base}/train.csv")
train_df.head()

In [None]:
# Checkpoint that provides both the tokenizer and the sequence classifier
# used by make() to label paragraphs.
checkpoint_path = "../labeling_ckpt/checkpoint-200000"  # checkpoint to use
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

In [5]:
def split_sentences(text):
    """Split *text* on newlines and return its non-blank lines.

    Each returned piece has leading/trailing whitespace removed; lines that
    are empty after stripping are dropped.
    """
    pieces = []
    for raw_line in text.split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces


def response_postprocessing(decoded):
    """Convert a decoded model response into a numeric label.

    Only the last 100 characters of the stripped response are inspected,
    case-insensitively, for the whole words "AI" and "HUMAN".

    Args:
        decoded: The decoded response text.

    Returns:
        1 if the last verdict word in the tail is "AI", 0 if it is "HUMAN",
        and 0.5 when neither word appears.
    """
    # The final verdict is expected at the end of the response, so a short
    # tail is enough; upper-casing makes the match case-insensitive.
    tail = decoded.strip()[-100:].upper()

    # Search the tail as written. Matching against the reversed string (as the
    # previous version did) can only ever find "IA" / "NAMUH", never the
    # actual labels. When both labels occur, the one mentioned last wins.
    verdicts = re.findall(r'\b(?:AI|HUMAN)\b', tail)
    if not verdicts:
        return 0.5
    return 1 if verdicts[-1] == "AI" else 0

In [6]:
def make(sentence, tokenizer, model):
    """Predict a class index for one randomly chosen 512-token window of *sentence*.

    The text is tokenized into overlapping windows (max_length=512,
    stride=256, padded to full length). One window is picked at random and
    passed through *model*; the arg-max over its logits is returned.

    Args:
        sentence: Text to classify.
        tokenizer: Callable tokenizer returning PyTorch tensors keyed by
            "input_ids" / "attention_mask" (one row per window).
        model: Classifier exposing ``.device`` and returning an object with
            a ``.logits`` attribute.

    Returns:
        A tensor holding the predicted class index for the single selected
        window (batch size 1).
    """
    valid_keys = {"input_ids", "attention_mask"}

    encoded = tokenizer(
        sentence,
        truncation=True,
        padding='max_length',
        max_length=512,
        stride=256,  # consecutive windows overlap
        return_overflowing_tokens=True,
        return_tensors="pt",
    )

    # Pick one of the windows at random.
    n_segments = encoded["input_ids"].size(0)
    seg_idx = random.randint(0, n_segments - 1)

    # Keep only the tensors the model is fed; this whitelist already drops
    # "token_type_ids" and "overflow_to_sample_mapping", so no extra pop is
    # needed. unsqueeze(0) restores the batch dimension for the single window.
    model_inputs = {
        key: value[seg_idx].unsqueeze(0).to(model.device)
        for key, value in encoded.items()
        if key in valid_keys
    }

    # Pure inference: disable autograd so no graph is built per call.
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Softmax is monotonic, so the arg-max can be taken on the logits directly.
    return torch.argmax(logits, dim=-1)

In [None]:
# Destination file for the paragraph-level pseudo labels.
output_csv = f"{data_base}/pseudo_labeling.csv"

# One tuple per paragraph, in this column order.
columns = ["title", "paragraph_idx", "paragraph", "paragraph_label", "document_label"]
records = []

for _, row in tqdm(train_df.iterrows(), desc="processing", total=len(train_df)):
    title = row["title"]
    document_label = row["generated"]

    for idx, paragraph in enumerate(split_sentences(row["full_text"])):
        # Paragraphs of documents labeled 0 inherit label 0 without running
        # the model; every other paragraph is labeled by the classifier.
        if document_label == 0:
            paragraph_label = 0
        else:
            paragraph_label = make(paragraph, tokenizer, model).cpu().item()

        records.append((title, idx, paragraph, paragraph_label, document_label))

# Build the final DataFrame and persist it.
parsing_df = pd.DataFrame(records, columns=columns)
parsing_df.to_csv(output_csv, index=False)

In [None]:
# Reload the pseudo-labeled paragraphs and expose the paragraph text under
# the "full_text" column name.
train_csv = pd.read_csv(output_csv).rename(columns={'paragraph': 'full_text'})

train_csv.head()

In [None]:
# Row count plus mean / max / min character length of the paragraph texts.
print(f"데이터 개수 : {len(train_csv)}")

# Character length of every paragraph, computed once and reused below.
text_lengths = [len(text) for text in train_csv["full_text"]]

avg_text = sum(text_lengths) / len(train_csv)
print(f"평균 text 길이 : {avg_text}")

max_length = max(text_lengths)
min_length = min(text_lengths)
print(f"가장 길이가 긴거 : {max_length}")
print(f"가장 길이가 짧은거 : {min_length}")

In [None]:
# Sum of paragraph labels relative to the number of rows, and its inverse.
label_total = sum(train_csv['paragraph_label'])
row_count = len(train_csv)
print(f"클래스 비율 : {label_total / row_count}")
print(f"가중치 : {row_count / label_total}")

In [None]:
# Rows whose paragraph label agrees with the label of their source document.
labels_agree = train_csv['document_label'] == train_csv['paragraph_label']
filter_df = train_csv[labels_agree]

# Agreeing rows whose paragraph label is 1.
list_1_1 = [row for _, row in filter_df.iterrows() if row['paragraph_label'] == 1]

print(f"1 == 1 : {len(list_1_1)}")
# Rows where the paragraph label differs from the document label.
print(f"0 == 1 : {len(train_csv[~labels_agree])}")

In [None]:
# 1. Keep only the rows whose document label is 1.
gen1_df = train_csv[train_csv["document_label"] == 1]

# 2. Mean paragraph label per title within those rows.
grouped = gen1_df.groupby("title")["paragraph_label"].mean()

# 3. Titles whose mean is exactly 0, i.e. none of their paragraphs got label 1.
zero_titles = grouped[grouped == 0].index.tolist()

print(f"paragraph_label 평균이 0인 title 개수: {len(zero_titles)}")
print("예시:", zero_titles[:10])

# Number of distinct titles overall.
total_titles = train_csv["title"].nunique()

# Distinct titles that have document label 1.
generated_1_titles = gen1_df["title"].unique()

# Count and ratio of document-label-1 titles. The count previously used
# len(zero_titles), which did not match the printed label and left
# generated_1_titles unused.
num_generated_1 = len(generated_1_titles)
ratio = num_generated_1 / total_titles

print(f"전체 title 수: {total_titles}")
print(f"generated == 1 인 title 수: {num_generated_1}")
print(f"비율: {ratio:.4f} ({ratio*100:.2f}%)")


In [None]:
# Drop every row belonging to a title in zero_titles, then overwrite the
# pseudo-label CSV with the remaining rows.
keep_mask = ~train_csv["title"].isin(zero_titles)
filtered_df = train_csv[keep_mask]
filtered_df.to_csv(output_csv, index=False)
