In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from google.colab import drive

In [2]:
# Mount Google Drive and switch the working directory to the course data folder,
# so that later cells can open files in it by bare relative name.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/14_자연어 처리 심화_최종/data')

Mounted at /content/drive


In [3]:
# Load data
# Both files are plain comma-separated CSVs; the "dev" file is what this
# notebook later evaluates on as its test set.
train_data_path = '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/nikluge-au-2022-train.csv'
test_data_path = '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/nikluge-au-2022-dev.csv'
train_data, test_data = [
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
]

In [4]:
# Split data into train and validation sets
# 20% of the training rows are held out; the fixed random_state keeps the
# split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['input'],
    train_data['output'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Load tokenizer and model
# Both are loaded from the same "beomi/KcELECTRA-base-v2022" checkpoint;
# num_labels=2 requests a two-class sequence-classification head.
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Set device
# Use the GPU when CUDA is available and fall back to the CPU otherwise, then
# move the model there. The bare last expression also displays the model.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [7]:
# Define dataset class
class HateDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    `texts` and `labels` are read positionally through `.iloc`, so they are
    expected to be pandas objects (e.g. Series) of equal length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Keep references to the data, the tokenizer and the fixed sequence length."""
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the length of `texts`."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the idx-th example as a dict of long tensors.

        The text is padded/truncated to `max_len` tokens. The dict holds
        'input_ids', 'attention_mask', 'token_type_ids' and 'labels'.
        """
        raw_text = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]
        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        # Convert the encoder's list outputs to tensors, then attach the label.
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [8]:
# Create datasets and data loaders
# MAX_LEN is the token length every text is padded/truncated to by HateDataset.
MAX_LEN = 128
train_dataset = HateDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = HateDataset(texts=X_val, labels=y_val, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = HateDataset(
    texts=test_data['input'],
    labels=test_data['output'],
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
# Only the training loader shuffles; the evaluation loaders keep row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [9]:
# Set optimizer, loss function, and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE(review): `criterion` is not referenced by the training or evaluation
# loops in this notebook (they read the loss from `outputs.loss`).
criterion = nn.CrossEntropyLoss()
# mode='max' because the scheduler is stepped with validation accuracy:
# the learning rate is multiplied by 0.1 once accuracy has stopped improving
# for more than `patience` epochs.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.1,
    patience=2,
    verbose=True,
)



In [10]:
# Model training and validation
# Each epoch runs one optimisation pass over train_loader, then measures
# accuracy on val_loader and feeds that accuracy to the LR scheduler.
num_epochs = 4
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        # Passing labels makes the model compute the loss itself.
        outputs = model(**inputs, labels=batch['labels'].to(device))
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # ---- Validation pass ----
    model.eval()
    # Count correct predictions over all examples instead of averaging
    # per-batch accuracies: the final batch is usually smaller than the rest,
    # and an unweighted mean of batch accuracies would over-weight it.
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            logits = model(**inputs).logits
            predictions = torch.argmax(logits, dim=-1)
            val_correct += (predictions == labels).sum().item()
            val_total += labels.size(0)
    avg_train_loss = total_loss / len(train_loader)
    avg_val_accuracy = val_correct / val_total
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_train_loss:.3f} | Validation Accuracy: {avg_val_accuracy:.3f}")
    # Update learning rate scheduler
    scheduler.step(avg_val_accuracy)

100%|██████████| 208/208 [04:09<00:00,  1.20s/it]


Epoch 1/4 | Loss: 0.283 | Validation Accuracy: 0.944


100%|██████████| 208/208 [04:12<00:00,  1.22s/it]


Epoch 2/4 | Loss: 0.121 | Validation Accuracy: 0.948


100%|██████████| 208/208 [04:13<00:00,  1.22s/it]


Epoch 3/4 | Loss: 0.068 | Validation Accuracy: 0.934


100%|██████████| 208/208 [04:12<00:00,  1.21s/it]


Epoch 4/4 | Loss: 0.045 | Validation Accuracy: 0.949


In [11]:
# Final evaluation on test data
model.eval()
# Accuracy is computed as correct / total over the whole set. Averaging
# per-batch accuracies would give a smaller final batch the same weight as a
# full batch and bias the reported number.
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)
        test_correct += (predictions == labels).sum().item()
        test_total += labels.size(0)
avg_test_accuracy = test_correct / test_total
print(f"Test Accuracy: {avg_test_accuracy:.3f}")

Test Accuracy: 0.952


In [12]:
# Save the trained model
# The model and the tokenizer are written to the same Drive folder (created if
# it does not exist yet); the inference cell reloads both from this path.
output_dir = '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model/tokenizer_config.json',
 '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model/special_tokens_map.json',
 '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model/vocab.txt',
 '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model/added_tokens.json',
 '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model/tokenizer.json')

In [None]:
# Load the example data to run predictions on
example_data_path = '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/example.csv'
example_data = pd.read_csv(example_data_path)

# Reload the tokenizer and model from the folder they were saved to.
# This rebinds the notebook-level `tokenizer`, `model` and `device` names.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model")
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/14_자연어 처리 심화_최종/data/korean_hate_serch_Model")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Dataset class for unlabeled example texts
class ExampleDataset(Dataset):
    """Label-free dataset that tokenizes each text on access for inference.

    `texts` is read positionally through `.iloc`, so it is expected to be a
    pandas object (e.g. Series).
    """

    def __init__(self, texts, tokenizer, max_len=256):
        """Keep references to the texts, the tokenizer and the sequence length."""
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return 'input_ids' and 'attention_mask' tensors for the idx-th text."""
        encoded = self.tokenizer.encode_plus(
            str(self.texts.iloc[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis; squeeze(0) removes it
        # so the DataLoader can stack the samples itself.
        return {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }

# Build the data loader for the example texts
# NOTE(review): ExampleDataset is used with its default max_len (256) here,
# while the training cells use MAX_LEN = 128 — confirm the difference is intended.
example_dataset = ExampleDataset(example_data['input'], tokenizer)
example_loader = DataLoader(example_dataset, batch_size=16)

# Predict a class for every example, batch by batch, without tracking gradients
model.eval()
predictions = []
with torch.no_grad():
    for batch in example_loader:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        preds = model(**inputs).logits.argmax(dim=-1)
        predictions.extend(preds.cpu().numpy())

# Attach the predictions to the DataFrame as the 'output' column
example_data['output'] = predictions

# Save the result as a new CSV file
output_path = '/content/drive/MyDrive/14_자연어 처리 심화_최종/data/prediction_example.csv'
example_data.to_csv(output_path, index=False)

In [None]:
# 별첨 csv to jsonl
import pandas as pd
import json

# Read the predictions CSV (path is relative to the current working directory)
df = pd.read_csv('prediction_example.csv', encoding='utf-8')

# Save in JSON Lines format: one JSON object per row, one row per line.
# ensure_ascii=False keeps non-ASCII text readable instead of \u-escaping it.
with open('prediction_example.jsonl', 'w', encoding='utf-8') as f:
    for _, row in df.iterrows():
        f.write(json.dumps(row.to_dict(), ensure_ascii=False) + '\n')

In [None]:
# 별첨 2 jsonl to csv
import json
import csv

def jsonl_to_csv(jsonl_file, csv_file):
    """Convert a JSON Lines file into a CSV file.

    The first non-blank record supplies the header (its keys, in order).
    Every row is then written in header order, so a record whose keys are
    ordered differently still lands in the right columns. A key missing from
    a record yields an empty cell; keys that are absent from the first record
    are not written.

    Args:
        jsonl_file: Path of the input file, read as UTF-8, holding one JSON
            object per line. Blank lines are skipped.
        csv_file: Path of the CSV file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # newline='' lets the csv module control line endings itself.
    # NOTE(review): 'utf-8 sig' is expected to normalize to the utf_8_sig
    # (BOM-prefixed UTF-8) codec; the canonical spelling is 'utf-8-sig'.
    with open(csv_file, 'w', newline='', encoding='utf-8 sig') as csvf:
        writer = csv.writer(csvf)
        header = None

        with open(jsonl_file, 'r', encoding='utf-8') as jsonlf:
            for line in jsonlf:
                # A trailing or stray blank line is not a record; json.loads
                # would raise on it.
                if not line.strip():
                    continue

                data = json.loads(line)

                # The first record fixes the column set and order.
                if header is None:
                    header = list(data.keys())
                    writer.writerow(header)

                # Look values up by header key so columns stay aligned even
                # when a record orders or omits its keys differently.
                writer.writerow([data.get(key, '') for key in header])

# Paths of the JSONL input and the CSV output (relative to the working directory)
jsonl_file = 'nikluge-au-2022-train.jsonl'
csv_file = 'train_output.csv'

# Convert the JSONL file and save the result as CSV
jsonl_to_csv(jsonl_file=jsonl_file, csv_file=csv_file)