In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random
import numpy as np
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, LongformerForSequenceClassification, LongformerTokenizer, DebertaV2ForSequenceClassification
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments, PreTrainedTokenizer
from sklearn.model_selection import train_test_split
from collections import Counter
from transformers import BertModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm
import torch.nn.functional as F
from types import SimpleNamespace
import re
from torch.optim import AdamW
from sklearn.model_selection import KFold
import os
from libauc.losses import AUCMLoss
from libauc.optimizers import PESG
import transformers
import gc

In [2]:
# Notebook-wide settings; "data_base" is the directory the input CSV files are read from.
CONFIG = dict(data_base="../data")

In [3]:
# Read the pseudo-labelled training table and the test table from the data directory.
train_csv, test_csv = (
    pd.read_csv(f"{CONFIG['data_base']}/{csv_name}")
    for csv_name in ("pseudo_labeling.csv", "test.csv")
)

In [None]:
# Rename the paragraph-level columns to the names used by the rest of the notebook
# ('full_text' for the text, 'generated' for the label).
train_csv = train_csv.rename(
    columns=dict(paragraph='full_text', paragraph_label='generated')
)

train_csv.head()

In [None]:
# Keep only the rows whose 'generated' label is 0 (change the constant to select 1),
# shuffle them with a fixed seed and renumber the index from zero.
label_0 = (
    train_csv.loc[train_csv['generated'] == 0]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
label_0

In [None]:
# Hugging Face model identifier of the instruction-tuned model used for paraphrasing.
model_id = "rtzr/ko-gemma-2-9b-it"

# Text-generation pipeline with bfloat16 weights; device placement is left to "auto".
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

# Switch the underlying model to evaluation mode (inference only).
pipeline.model.eval()

In [7]:
def prompt_gen(text):
    """Build the Korean paraphrase instruction for ``text``.

    The prompt has three lines: a request to rewrite the sentence as a similar
    one, a request to answer with exactly one similar sentence and nothing
    else, and the sentence itself prefixed with "문장: ".

    Args:
        text: The sentence to paraphrase; it is interpolated with ``str`` formatting.

    Returns:
        The full prompt as a single string.
    """
    instruction = "이 문장을 비슷한 문장으로 다시 작성해줘.\n"
    constraint = "다른 부수적인 말하지 말고 오직 비슷한 문장 1개만 말해주면 돼.\n"
    return "".join((instruction, constraint, f"문장: {text}"))

In [8]:
def generate(text, pipeline, max_new_tokens=2048, temperature=0.6, top_p=0.9):
    """Paraphrase ``text`` with a chat-style text-generation pipeline.

    The instruction built by ``prompt_gen`` is wrapped in a single user turn,
    rendered with the tokenizer's chat template, and one completion is sampled.

    Args:
        text: The sentence or paragraph to paraphrase.
        pipeline: A ``transformers`` text-generation pipeline whose tokenizer
            provides a chat template.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The newly generated text only; the prompt is not included.
    """
    messages = [
        {"role": "user", "content": prompt_gen(text)}
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Stop on the regular EOS token or on the chat end-of-turn marker.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<end_of_turn>")
    ]

    # Inference only: make sure no autograd state is kept around.
    with torch.no_grad():
        outputs = pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            # Let the pipeline strip the prompt itself instead of slicing the
            # returned string by ``len(prompt)``.
            return_full_text=False,
        )

    # Release cached GPU memory between calls.
    torch.cuda.empty_cache()
    return outputs[0]["generated_text"]


In [None]:
# Paraphrase every row of `label_0` with the LLM and collect the results.
# Each paraphrase is stored with label 1 in the 'generated' column.
AUG_DIR = "./augmentation_data"
AUG_PATH = f"{AUG_DIR}/aug_0.csv"

# Create the output directory up front so the first checkpoint write cannot
# fail after an expensive generation call.
os.makedirs(AUG_DIR, exist_ok=True)

augmentation = {
    "title" : [],
    "full_text" : [],
    "generated" : [],
}

for _, row in tqdm(label_0.iterrows(), total=len(label_0)):
    title = row['title']
    text = row['full_text']

    aug_text = generate(text, pipeline)

    augmentation['title'].append(title)
    # Flatten the model output to one line (newlines are removed, not replaced by spaces).
    augmentation['full_text'].append(aug_text.strip().replace("\n", ""))
    augmentation['generated'].append(1)

    # Checkpoint after every row so an interrupted run keeps all finished rows.
    pd_aug = pd.DataFrame(augmentation)
    pd_aug.to_csv(AUG_PATH, index=False)

    gc.collect()
    torch.cuda.empty_cache()


: 