<a href="https://colab.research.google.com/github/tonykorea99/Prototype/blob/main/RoBERTa_in-context_lerning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Hugging Face libraries imported by the cells below
# (notebook shell commands; they run in the kernel's environment).
!pip install datasets
!pip install transformers


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:

In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import requests
from io import StringIO
import re

# 1. Stopword removal and text preprocessing helpers
# Built once at import time instead of on every call; a frozenset gives O(1)
# membership tests and collapses the duplicate '이' entry of the original list.
_KOREAN_STOPWORDS = frozenset([
    '은', '는', '이', '가', '을', '를', '의', '에', '에서', '와', '과', '도', '로', '으로', '그리고', '하지만',
    '그러나', '따라서', '그러므로', '그래서', '또한', '매우', '아주', '더', '가장', '너무', '항상', '자주',
    '때로', '가끔', '그', '저', '나', '너', '우리', '당신', '여러분',
])


def remove_stopwords(text):
    """Drop Korean stopwords from a whitespace-separated string.

    Only whole whitespace-delimited tokens that exactly match a stopword are
    removed; a particle attached to another word (no space in between) is
    left untouched.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces (empty string when
        nothing is left).
    """
    return ' '.join(word for word in text.split() if word not in _KOREAN_STOPWORDS)

def preprocess_text(text):
    """Clean one raw message for the classifier.

    Steps: replace URLs, keep only Hangul syllables and whitespace, remove
    stopwords, then strip surrounding whitespace.

    Args:
        text: Raw message. Non-string values (e.g. the float NaN pandas uses
            for empty CSV cells) are treated as missing.

    Returns:
        The cleaned string, or None when the input is not a string or the
        cleaned text is 4 characters or shorter.
    """
    # re.sub raises TypeError on non-string input, so reject it up front.
    if not isinstance(text, str):
        return None

    text = re.sub(r'http[s]?://\S+', '[URL]', text)
    # NOTE(review): the substitution below keeps only Hangul and whitespace,
    # so the '[URL]' placeholder inserted above is deleted again and URLs end
    # up simply removed. Behaviour is left as-is; confirm whether the
    # placeholder was meant to survive as a feature.
    text = re.sub(r'[^가-힣\s]', '', text)
    text = remove_stopwords(text).strip()
    return text if len(text) > 4 else None

# 2. Load and preprocess the data
def load_and_preprocess_data(user="tonykorea99", repo="Spam-alart", branch="main",
                             directory="moddata", timeout=30):
    """Download every CSV in a GitHub directory and preprocess its messages.

    The directory listing is fetched through the GitHub contents API; each
    ``.csv`` entry is downloaded via its ``download_url``. Files that lack a
    ``v1`` (label) or ``v2`` (message) column are skipped.

    Args:
        user: GitHub account that owns the repository.
        repo: Repository name.
        branch: Git ref to read from.
        directory: Directory inside the repository that holds the CSV files.
        timeout: Timeout in seconds applied to each HTTP request.

    Returns:
        A DataFrame with columns ``v1`` and ``processed_text``, containing
        only rows whose message survived ``preprocess_text``. Empty (with the
        same columns) when no usable CSV was found.

    Raises:
        requests.HTTPError: If the listing or a file download returns an
            error status (e.g. when the API rate limit is hit).
    """
    url = f"https://api.github.com/repos/{user}/{repo}/contents/{directory}?ref={branch}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly here: an error response is a JSON object, not a file list,
    # and would otherwise surface as a confusing TypeError further down.
    response.raise_for_status()
    files = response.json()

    csv_files = [file for file in files if file['name'].endswith('.csv')]

    all_data = []
    for file in csv_files:
        csv_response = requests.get(file['download_url'], timeout=timeout)
        csv_response.raise_for_status()
        df = pd.read_csv(StringIO(csv_response.text))

        # Skip files without the expected schema instead of failing on the
        # missing 'processed_text' column later.
        if 'v1' not in df.columns or 'v2' not in df.columns:
            continue

        # Empty cells are NaN; drop them before the text functions see them.
        df = df.dropna(subset=['v1', 'v2'])
        df['processed_text'] = df['v2'].astype(str).apply(preprocess_text)
        df = df.dropna(subset=['processed_text'])  # rows rejected by preprocessing
        all_data.append(df[['v1', 'processed_text']])

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not all_data:
        return pd.DataFrame(columns=['v1', 'processed_text'])
    return pd.concat(all_data, ignore_index=True)

# 3. Run data loading and preprocessing
data = load_and_preprocess_data()

# 4. Map labels ('spam' -> 1, 'ham' -> 0); any other v1 value becomes NaN
label_mapping = {'spam': 1, 'ham': 0}
data['label'] = data['v1'].map(label_mapping)

# 5. Drop rows whose label could not be mapped
data = data.dropna(subset=['label']).reset_index(drop=True)
# The column comes out of the mapping step as float (the recorded sample
# shows 1.0); cast to int so the labels are class indices for the classifier.
data['label'] = data['label'].astype(int)

print("전처리된 데이터 샘플:")
print(data.head())

# 6. Load the RoBERTa tokenizer and model
# NOTE(review): 'roberta-base' is, as far as I know, pretrained on English
# text while processed_text is Hangul-only — confirm whether a Korean
# checkpoint was intended.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# 7. Tokenize
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating to 128 tokens."""
    return tokenizer(examples['processed_text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = Dataset.from_pandas(data[['processed_text', 'label']])
tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)

# 8. Metric computation (uses scikit-learn)
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = torch.tensor(logits).argmax(dim=-1)

    scores = {'accuracy': accuracy_score(labels, predictions)}
    weighted_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, metric_fn in weighted_metrics:
        scores[metric_name] = metric_fn(labels, predictions, average='weighted')
    return scores

# 9. Training configuration
# The recorded run with a per-device train batch of 128 ended in a CUDA
# OutOfMemoryError on a ~15 GiB GPU. Use 32 x 4 gradient-accumulation steps
# (same effective batch of 128) and mixed precision when a GPU is present.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    num_train_epochs=3,
    weight_decay=0.01,
)

# Hold out 20% for evaluation: scoring on the training rows themselves would
# report how well the model memorised them, not how well it generalises.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# 10. Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    compute_metrics=compute_metrics
)

trainer.train()
eval_results = trainer.evaluate()
print(f"Fine-tuning 방식의 정확도: {eval_results['eval_accuracy']}")


전처리된 데이터 샘플:
     v1                                     processed_text  label
0  spam          국외발신통관세금안내관세미납 원 즉시 납부 바랍니다미납시 강제집행예정민원센터    1.0
1  spam              국외발신고객님관세청통관물품 세금확인 관세세금원통관번호자동처리예정문의    1.0
2  spam                           발신년 고객님건강검사 통지서 발송완료상세확인    1.0
3  spam             국외발신통관세금안내관세미납원 미납시 민사소송 고발조치예정입니다민원접수    1.0
4  spam  발신삼도농협택배일정알림 월 추석명절 택배 일정 안내드립니다접수마감 월 일월단식품류는...    1.0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/96491 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 17.06 MiB is free. Process 9121 has 14.73 GiB memory in use. Of the allocated memory 14.54 GiB is allocated by PyTorch, and 68.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from sklearn.metrics import accuracy_score

# 3. Run data loading and preprocessing
data = load_and_preprocess_data()
print("전처리된 데이터 샘플:")
print(data.head())

# 4. Add labels ('spam' -> 1, 'ham' -> 0)
label_mapping = {'spam': 1, 'ham': 0}
data['label'] = data['v1'].map(label_mapping)
# Same cleanup as the fine-tuning cell: unmapped v1 values become NaN and
# would make accuracy_score fail, so drop them and store labels as int.
data = data.dropna(subset=['label']).reset_index(drop=True)
data['label'] = data['label'].astype(int)

# 5. Load the RoBERTa tokenizer and model (used without fine-tuning)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# 6. Inference only (no fine-tuning)
# NOTE(review): the load warning in the recorded output says the classifier
# head weights are newly initialised, so this measures an untrained head
# rather than prompt-based in-context learning — confirm that is the
# intended baseline.
texts = data['processed_text'].tolist()
actual_labels = data['label'].tolist()
predicted_labels = []
batch_size = 64  # run the model on batches instead of one row at a time

with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        logits = model(**inputs).logits
        predicted_labels.extend(torch.argmax(logits, dim=-1).tolist())

# 7. Compute accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)
print(f"In-context learning 방식의 정확도: {accuracy}")