In [1]:
import os, re, string, sys, subprocess, random, gc
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def _pip_install(pkg):
    try:
        __import__(pkg.split("==")[0].replace("-", "_"))
    except Exception:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

for pkg in ["torch", "tqdm", "scikit-learn", "summa==1.2.0"]:
    _pip_install(pkg)

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from summa.summarizer import summarize as summa_summarize

In [4]:
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
CSV_PATH = "/content/News_Summary.csv"

df = pd.read_csv(CSV_PATH, encoding="utf-8", low_memory=False)

In [6]:
src_candidates = [c for c in df.columns if c.lower() in ["ctext", "text", "article", "articles"]]
tgt_candidates = [c for c in df.columns if c.lower() in ["headlines", "headline", "summary", "title"]]

if not src_candidates or not tgt_candidates:
    raise ValueError(f"본문/요약 컬럼을 찾지 못함. 본문 후보:{df.columns.tolist()} 중 'ctext'/'text', 요약은 'headlines'/'summary' 필요")

SRC_COL, TGT_COL = src_candidates[0], tgt_candidates[0]

df[SRC_COL] = df[SRC_COL].astype(str).str.strip()
df[TGT_COL] = df[TGT_COL].astype(str).str.strip()
df = df.replace({"None": np.nan, "nan": np.nan}).dropna(subset=[SRC_COL, TGT_COL]).reset_index(drop=True)

In [7]:
def _wc(s): return len(s.split())
print(f"[분석] 본문 평균 길이: {df[SRC_COL].map(_wc).mean():.1f}, 요약 평균 길이: {df[TGT_COL].map(_wc).mean():.1f}, 샘플: {len(df)}")

[분석] 본문 평균 길이: 58.2, 요약 평균 길이: 9.6, 샘플: 98401


In [8]:
EN_STOPS = set("""
a about above after again against all am an and any are aren't as at be because been
before being below between both but by can't cannot could couldn't did didn't do does
doesn't doing don't down during each few for from further had hadn't has hasn't have
haven't having he he'd he'll he's her here here's hers herself him himself his how
how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most
mustn't my myself no nor not of off on once only or other ought our ours ourselves
out over own same shan't she she'd she'll she's should shouldn't so some such than
that that's the their theirs them themselves then there there's these they they'd
they'll they're they've this those through to too under until up very was wasn't we
we'd we'll we're we've were weren't what what's when when's where where's which while
who who's whom why why's with won't would wouldn't you you'd you'll you're you've your
yours yourself yourselves
""".split())

PUNCT_TABLE = str.maketrans("", "", string.punctuation)

In [9]:
def simple_tokenize(text: str):
    # 알파벳 단어만 추출
    return re.findall(r"[A-Za-z]+", text)

In [10]:
def normalize_text(s: str, remove_stopwords=False, lower=True):
    if not isinstance(s, str):
        return ""
    x = s
    x = re.sub(r"<.*?>", " ", x)  # HTML
    x = re.sub(r"https?://\S+|www\.\S+", " ", x)  # URL
    x = x.replace("\u200b", " ").replace("\xa0", " ")
    if lower: x = x.lower()
    x = re.sub(r"[^a-z\s]", " ", x)  # 숫자/기호 제거, 영문만
    x = re.sub(r"\s+", " ", x).strip()
    toks = simple_tokenize(x)
    if remove_stopwords:
        toks = [t for t in toks if t not in EN_STOPS]
    return " ".join(toks)

In [11]:
# 본문 불용어 제거 O, 요약 불용어 제거 X
df["src_clean"] = df[SRC_COL].apply(lambda s: normalize_text(s, remove_stopwords=True))
df["tgt_clean"] = df[TGT_COL].apply(lambda s: normalize_text(s, remove_stopwords=False))

# 너무 짧거나 긴 샘플 제거
df = df[(df["src_clean"].str.split().map(len) >= 30) & (df["tgt_clean"].str.split().map(len).between(4, 20))]
df = df.reset_index(drop=True)
print(f"[정제] 필터 후 샘플 수: {len(df)}")

[정제] 필터 후 샘플 수: 94794


In [12]:
for col in list(df.columns):
    if col not in ["src_clean", "tgt_clean"]:
        try: del df[col]
        except: pass
gc.collect()

37

In [13]:
train_df, val_df = train_test_split(df[["src_clean", "tgt_clean"]], test_size=0.1, random_state=SEED, shuffle=True)

# 개발 모드: 큰 데이터면 일부만 사용 (RAM 부족시 숫자를 더 줄이세요)
USE_N = 20000
if len(train_df) > USE_N:
    train_df = train_df.sample(USE_N, random_state=SEED).reset_index(drop=True)
if len(val_df) > USE_N//10:
    val_df = val_df.sample(USE_N//10, random_state=SEED).reset_index(drop=True)

In [14]:
def build_vocab(texts, max_size=20000, min_freq=2, specials=("<pad>", "<sos>", "<eos>", "<unk>")):
    counter = Counter()
    for t in texts:
        counter.update(t.split())
    words = [w for w, c in counter.items() if c >= min_freq]
    words = sorted(words, key=lambda w: (-counter[w], w))
    words = words[:max_size - len(specials)]
    itos = list(specials) + words
    stoi = {w:i for i,w in enumerate(itos)}
    return stoi, itos

SRC_STOI, SRC_ITOS = build_vocab(train_df["src_clean"].tolist(), max_size=20000, min_freq=2)
TGT_STOI, TGT_ITOS = build_vocab(train_df["tgt_clean"].tolist(), max_size=10000, min_freq=2)
PAD_IDX, SOS_IDX, EOS_IDX, UNK_IDX = 0, 1, 2, 3

In [15]:
MAX_SRC_LEN = 120
MAX_TGT_LEN = 20

In [16]:
def encode_sentence(s, stoi, max_len, add_sos=False, add_eos=False):
    ids = [stoi.get(t, UNK_IDX) for t in s.split()]
    if add_sos: ids = [SOS_IDX] + ids
    if add_eos: ids = ids + [EOS_IDX]
    ids = ids[:max_len]
    if len(ids) < max_len:
        ids += [PAD_IDX] * (max_len - len(ids))
    return ids

In [17]:
class SummDataset(Dataset):
    def __init__(self, df_):
        self.src = df_["src_clean"].tolist()
        self.tgt = df_["tgt_clean"].tolist()
    def __len__(self): return len(self.src)
    def __getitem__(self, i):
        src_ids = encode_sentence(self.src[i], SRC_STOI, MAX_SRC_LEN, add_eos=True)
        tgt_in  = encode_sentence(self.tgt[i], TGT_STOI, MAX_TGT_LEN, add_sos=True)
        tgt_out = encode_sentence(self.tgt[i], TGT_STOI, MAX_TGT_LEN, add_eos=True)
        return torch.tensor(src_ids), torch.tensor(tgt_in), torch.tensor(tgt_out)

In [18]:
train_ds, val_ds = SummDataset(train_df), SummDataset(val_df)

BATCH_SIZE = 16
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=2)

In [19]:
EMB_SRC = 96
EMB_TGT = 96
HID = 128
ENC_LAYERS = 1
DEC_LAYERS = 1
DROPOUT = 0.2

In [20]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(emb_dim, hid, num_layers=num_layers, batch_first=True,
                            bidirectional=True, dropout=0 if num_layers==1 else dropout)
        self.fc_h = nn.Linear(hid*2, hid)
        self.fc_c = nn.Linear(hid*2, hid)
    def forward(self, src):
        emb = self.emb(src)                       # (B,S,E)
        outputs, (h, c) = self.lstm(emb)          # outputs: (B,S,2H)
        h_cat = torch.cat((h[-2], h[-1]), dim=1)  # (B,2H)
        c_cat = torch.cat((c[-2], c[-1]), dim=1)
        h0 = torch.tanh(self.fc_h(h_cat)).unsqueeze(0)  # (1,B,H)
        c0 = torch.tanh(self.fc_c(c_cat)).unsqueeze(0)  # (1,B,H)
        return outputs, (h0, c0)

In [21]:
class AdditiveAttention(nn.Module):
    def __init__(self, enc_hid, dec_hid):
        super().__init__()
        self.W1 = nn.Linear(enc_hid, dec_hid)
        self.W2 = nn.Linear(dec_hid, dec_hid)
        self.v  = nn.Linear(dec_hid, 1, bias=False)
    def forward(self, enc_outputs, dec_hidden):
        # enc_outputs: (B,S,2H), dec_hidden: (1,B,H)->(B,H)
        dec_hidden = dec_hidden.transpose(0, 1)  # (B,H)
        score = self.v(torch.tanh(self.W1(enc_outputs) + self.W2(dec_hidden).unsqueeze(1))).squeeze(-1)  # (B,S)
        attn  = torch.softmax(score, dim=1).unsqueeze(-1)   # (B,S,1)
        ctx   = torch.sum(attn * enc_outputs, dim=1)        # (B,2H)  ← S 축 합산 (핵심!)
        return ctx, attn

In [22]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_hid, dec_hid, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_IDX)
        self.attn = AdditiveAttention(enc_hid, dec_hid)
        self.lstm = nn.LSTM(emb_dim + enc_hid, dec_hid, num_layers=num_layers, batch_first=True,
                            dropout=0 if num_layers==1 else dropout)
        self.fc_out = nn.Linear(dec_hid, vocab_size)
    def forward(self, y_prev, hidden, cell, enc_outputs):
        if y_prev.dim() == 1:
            y_prev = y_prev.unsqueeze(1)
        y_prev = y_prev.long()
        emb = self.emb(y_prev)                 # (B,1,E)
        ctx, _ = self.attn(enc_outputs, hidden)
        if ctx.dim() == 3:                     # 안전가드 (혹시라도)
            ctx = ctx.sum(dim=1)
        x = torch.cat([emb, ctx.unsqueeze(1)], dim=-1)  # (B,1,E+2H)
        out, (h, c) = self.lstm(x, (hidden, cell))      # out: (B,1,H)
        logits = self.fc_out(out.squeeze(1))            # (B,V)
        return logits, h, c

In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, sos_idx=SOS_IDX, eos_idx=EOS_IDX, pad_idx=PAD_IDX, max_tgt_len=MAX_TGT_LEN):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.sos = sos_idx; self.eos = eos_idx; self.pad = pad_idx
        self.max_tgt_len = max_tgt_len
    def forward(self, src, tgt_in=None, teacher_forcing=0.5):
        enc_outputs, (h, c) = self.encoder(src)
        B = src.size(0)
        y = torch.full((B,1), self.sos, dtype=torch.long, device=src.device)
        outputs = []
        steps = self.max_tgt_len if tgt_in is None else tgt_in.size(1)
        for t in range(steps):
            logits, h, c = self.decoder(y, h, c, enc_outputs)
            outputs.append(logits.unsqueeze(1))
            if tgt_in is not None and random.random() < teacher_forcing:
                y = tgt_in[:, t].unsqueeze(1)
            else:
                y = logits.argmax(-1, keepdim=True)
        return torch.cat(outputs, dim=1)  # (B,T,V)
    @torch.no_grad()
    def generate(self, src, max_len=None):
        self.eval()
        enc_outputs, (h, c) = self.encoder(src)
        B = src.size(0)
        y = torch.full((B,1), self.sos, dtype=torch.long, device=src.device)
        outputs = []
        max_len = max_len or self.max_tgt_len
        for _ in range(max_len):
            logits, h, c = self.decoder(y, h, c, enc_outputs)
            nxt = logits.argmax(-1, keepdim=True)
            outputs.append(nxt)
            y = nxt
        gen = torch.cat(outputs, dim=1)  # (B,T)
        # EOS 잘라내기
        result = []
        for seq in gen.tolist():
            out = []
            for tok in seq:
                if tok == self.eos: break
                if tok not in (self.pad, self.sos):
                    out.append(tok)
            result.append(out)
        return result

In [24]:
ENC = Encoder(len(SRC_ITOS), EMB_SRC, HID, num_layers=ENC_LAYERS, dropout=DROPOUT).to(DEVICE)
DEC = Decoder(len(TGT_ITOS), EMB_TGT, enc_hid=HID*2, dec_hid=HID, num_layers=DEC_LAYERS, dropout=DROPOUT).to(DEVICE)
model = Seq2Seq(ENC, DEC).to(DEVICE)

In [25]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
scaler = GradScaler(enabled=torch.cuda.is_available())

  scaler = GradScaler(enabled=torch.cuda.is_available())


In [26]:
GRAD_ACCUM_STEPS = 4   # 16 * 4 = 64 효과 배치
MAX_NORM = 1.0
EPOCHS = 3

In [27]:
def run_epoch(loader, train=True):
    model.train(train)
    total = 0.0
    optimizer.zero_grad(set_to_none=True)
    for step, (src, tgt_in, tgt_out) in enumerate(tqdm(loader, desc="train" if train else "valid"), start=1):
        src, tgt_in, tgt_out = src.to(DEVICE), tgt_in.to(DEVICE), tgt_out.to(DEVICE)
        with autocast(enabled=torch.cuda.is_available()):
            logits = model(src, tgt_in=tgt_in, teacher_forcing=0.5 if train else 0.0)
            loss = criterion(logits.view(-1, logits.size(-1)), tgt_out.view(-1))
            loss = loss / (GRAD_ACCUM_STEPS if train else 1)
        if train:
            scaler.scale(loss).backward()
            if step % GRAD_ACCUM_STEPS == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_NORM)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
        total += loss.item() * (GRAD_ACCUM_STEPS if train else 1)
    if torch.cuda.is_available(): torch.cuda.empty_cache()
    return total / max(1, len(loader))

In [None]:
train_losses, val_losses = [], []
for ep in range(1, EPOCHS+1):
    tr = run_epoch(train_loader, train=True)
    vl = run_epoch(val_loader, train=False)
    train_losses.append(tr); val_losses.append(vl)
    print(f"[{ep}/{EPOCHS}] Train {tr:.4f} | Val {vl:.4f}")


  with autocast(enabled=torch.cuda.is_available()):
train:  21%|██        | 263/1250 [24:30<3:55:49, 14.34s/it]

In [None]:
# 손실 그래프 출력
plt.figure(figsize=(6,4))
plt.plot(range(1, EPOCHS+1), train_losses, label="Train")
plt.plot(range(1, EPOCHS+1), val_losses, label="Val")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Curve")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()  # 저장 대신 바로 출력

In [None]:
IDX2TGT = {i:w for i,w in enumerate(TGT_ITOS)}

def decode_ids(ids):
    return " ".join([IDX2TGT.get(i, "<unk>") for i in ids])

@torch.no_grad()
def generate_texts(df_, n_samples=20):
    sample = df_.sample(n=min(n_samples, len(df_)), random_state=SEED).reset_index(drop=True)
    src_batch = [encode_sentence(s, SRC_STOI, MAX_SRC_LEN, add_eos=True) for s in sample["src_clean"]]
    src_batch = torch.tensor(src_batch, device=DEVICE)
    gen_ids = model.generate(src_batch, max_len=MAX_TGT_LEN)
    gen_texts = []
    for ids in gen_ids:
        gen_texts.append(" ".join([IDX2TGT.get(i, "<unk>") for i in ids]))
    return sample, gen_texts

sample_df, abstractive_out = generate_texts(val_df, n_samples=30)

In [None]:
def key_terms(text, top_k=8):
    toks = [t for t in simple_tokenize(text.lower()) if t not in EN_STOPS]
    freq = Counter(toks)
    return [w for w,_ in freq.most_common(top_k)]

def keyword_coverage(ref_headline, pred):
    ref = set(key_terms(ref_headline, top_k=8))
    hyp = set([t for t in simple_tokenize(pred.lower()) if t not in EN_STOPS])
    if not ref: return 0.0
    return len(ref & hyp) / len(ref)

In [None]:
def grammar_completeness(s):
    # 간이 문법 점수 (POS 미사용): 동사/조동사 단어 포함 + 알파비율 + 유니크비율 + 끝마침부호
    if not s.strip(): return 0.0
    toks = simple_tokenize(s)
    if not toks: return 0.0
    verbs_hint = {"is","are","was","were","be","being","been","have","has","had",
                  "do","does","did","will","would","can","could","should","may","might","must"}
    has_verb = 1.0 if any(t in verbs_hint or t.endswith("ed") or t.endswith("ing") for t in toks) else 0.0
    alpha_ratio = 1.0  # 이미 알파만 사용
    uniq_ratio = len(set(toks)) / max(1,len(toks))
    ends_ok = 1.0 if s.strip()[-1:] in [".","!","?"] else 0.0
    score = 0.45*has_verb + 0.25*alpha_ratio + 0.25*uniq_ratio + 0.05*ends_ok
    return float(np.clip(score, 0.0, 1.0))

In [None]:
rows = []
for i, row in sample_df.iterrows():
    head = row["tgt_clean"]
    abs_sum = abstractive_out[i]

    # Step 5. Summa 추출요약 (가볍게, 실패시 앞 2문장 대체)
    try:
        ext_sum = summa_summarize(row["src_clean"], ratio=0.25)
        if not ext_sum:
            # 아주 짧은 텍스트는 비어올 수 있음 → 앞 n단어 40개 fallback
            ext_sum = " ".join(row["src_clean"].split()[:40])
    except Exception:
        ext_sum = " ".join(row["src_clean"].split()[:40])

    rows.append({
        "original_headline": head,
        "abstractive_summary": abs_sum,
        "extractive_summary": ext_sum,
        "keyword_coverage_abs": round(keyword_coverage(head, abs_sum), 3),
        "keyword_coverage_ext": round(keyword_coverage(head, ext_sum), 3),
        "grammar_score_abs": round(grammar_completeness(abs_sum), 3),
        "grammar_score_ext": round(grammar_completeness(ext_sum), 3),
        "headline_key_terms": ", ".join(key_terms(head, top_k=8)),
    })

In [None]:
result_df = pd.DataFrame(rows)
print(result_df)