In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd drive/MyDrive/work/korean-hate-speech-detection

/content/drive/MyDrive/work/korean-hate-speech-detection


In [3]:
%ls

'=0.20.1'               [0m[01;34mllama2-hoxy[0m/             [01;34mourfuckingmodel[0m/
 [01;34mdata[0m/                  llamatest.py             [01;34mpretrained_model_directory[0m/
 dev.hate1.csv          [01;34mmodelsave1[0m/              printprob.py
 dev.hate.csv           [01;34mmodelsave2[0m/              [01;34mreal_final[0m/
 dev.news_title.txt     [01;34mmodelsave3[0m/              [01;34mresults[0m/
 epoch_1_results.csv    [01;34mmodelsave4[0m/              setup_logs.txt
 epoch_24_results.csv   [01;34mmodelsave5[0m/              test.hate.no_label.csv
 epoch_2_results.csv    [01;34mmodelsave6[0m/              test.news_title.txt
 epoch_3_results.csv    [01;34mmodelsave7[0m/              [01;34mthdmodel[0m/
 epoch_4_results.csv    [01;34mmy_autotrain_llm[0m/        [01;34mtmp_trainer[0m/
 epoch_5_results.csv    offensive.csv            train.hate.csv
 [01;34minferenced[0m/            offensive_data_set.csv   train.news_title.txt
 install

In [4]:
%pip install transformers



In [5]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import pandas as pd
from torch import sigmoid
import os

device = torch.device("cuda:0")

# Create or load a pretrained model
model = AutoModelForSequenceClassification.from_pretrained("pretrained_model_directory")

In [6]:
hate_data = pd.read_csv("train.hate.csv")
offensive_data = pd.read_csv("offensive_data_set.csv")
dev_data = pd.read_csv("dev.hate.csv")
dev_data['label'].replace("hate", 1, inplace=True)
dev_data['label'].replace("none", 0, inplace=True)
dev_data['label'].replace("offensive", 0.5, inplace=True)
#dev_data = dev_data[dev_data['label'] != 0.5]

In [7]:
hate_data['label'].replace("hate", 1, inplace=True)
hate_data['label'].replace("none", 0, inplace=True)
hate_data['label'].replace("offensive", 0.5, inplace=True)
#hate_data = hate_data[hate_data['label'] != 0.5]

In [8]:
# BERT 다국어 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# 문장을 토큰화하고 패딩 적용
tokenized_texts = [
    tokenizer.encode(text, add_special_tokens=True) for text in hate_data["comments"]
]

max_length = max(len(tokens) for tokens in tokenized_texts)
padded_texts = [tokens + [0] * (max_length - len(tokens)) for tokens in tokenized_texts]

# 텐서로 변환
hate_input_ids = (torch.tensor(padded_texts)).to(device)
hate_labels = (torch.tensor(hate_data["label"].tolist())).to(device)

# 문장을 토큰화하고 패딩 적용
tokenized_texts = [
    tokenizer.encode(text, add_special_tokens=True) for text in offensive_data["comments"]
]

max_length = max(len(tokens) for tokens in tokenized_texts)
padded_texts = [tokens + [0] * (max_length - len(tokens)) for tokens in tokenized_texts]

offensive_input_ids = (torch.tensor(padded_texts)).to(device)
offensive_labels = (torch.tensor(offensive_data["label"].tolist())).to(device)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [9]:
hate_dataset = TensorDataset(hate_input_ids, hate_labels)
hate_data_loader = DataLoader(hate_dataset, batch_size=4, shuffle=True)

offensive_dataset = TensorDataset(offensive_input_ids, offensive_labels)
offensive_data_loader = DataLoader(offensive_dataset, batch_size=4, shuffle=True)

In [10]:
model.to(device)

# 옵티마이저 및 손실 함수 초기화
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

In [11]:
import csv

In [None]:
# 학습 및 평가
n_epochs = 24
for epoch in range(n_epochs):
    model.train()
    hate_total_loss = 0.0
    hate_total_batches = len(hate_data_loader)
    for batch_idx, batch in enumerate(hate_data_loader):
        input_ids, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids)
        logits = outputs.logits.squeeze(1)
        loss = criterion(logits, labels.float())
        loss.backward()
        optimizer.step()
        hate_total_loss += loss.item()

        # 출력 형식: Epoch [현재 에폭], Batch [현재 배치 / 전체 배치 수], Loss: [현재 배치 손실]
        print(
            f"Epoch {epoch + 1}, Batch {batch_idx + 1} / {hate_total_batches}, Loss: {loss.item()}"
        )

    # 평가
    model.eval()
    predictions = []
    true_labels = []
    probabilities_offensive = []

    # offensive data에 대하여 평가
    # logit 값에 따라 혐오 강도가 결정되는지 확인하기 위함
    for batch in offensive_data_loader:
        input_ids, labels = batch
        outputs = model(input_ids)
        logits = outputs.logits.squeeze(1)
        prob_offensive = sigmoid(logits)
        predictions.extend((prob_offensive >= 0.5).float().tolist())  # 0.5를 기준으로 이진 분류
        true_labels.extend(labels.tolist())
        probabilities_offensive.extend(prob_offensive.tolist())

        with open(f'epoch_{epoch+1}_results.csv', 'w', newline='') as file:
          writer = csv.writer(file)
          writer.writerow(['Probability_Offensive'])
          for prob in probabilities_offensive:
            writer.writerow([prob])



    #accuracy = accuracy_score(true_labels, predictions)
    #auc_roc = roc_auc_score(true_labels, predictions)
    #f1 = f1_score(true_labels, predictions)

    #print(
        #f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}, Accuracy: {accuracy}, Auc: {auc_roc}, f1: {f1}"
        #f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}, f1: {f1}"
    #)

    print(probabilities_offensive)

    # 모델 저장
model.save_pretrained("offensive_directory")  # type: ignore


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch 22, Batch 926 / 1974, Loss: 0.6934288740158081
Epoch 22, Batch 927 / 1974, Loss: 0.023286057636141777
Epoch 22, Batch 928 / 1974, Loss: 0.17441022396087646
Epoch 22, Batch 929 / 1974, Loss: 0.5412529706954956
Epoch 22, Batch 930 / 1974, Loss: 0.0005543653387576342
Epoch 22, Batch 931 / 1974, Loss: 0.3472605049610138
Epoch 22, Batch 932 / 1974, Loss: 0.17361415922641754
Epoch 22, Batch 933 / 1974, Loss: 0.17449158430099487
Epoch 22, Batch 934 / 1974, Loss: 0.1738089621067047
Epoch 22, Batch 935 / 1974, Loss: 0.5214084386825562
Epoch 22, Batch 936 / 1974, Loss: 0.17523232102394104
Epoch 22, Batch 937 / 1974, Loss: 0.0009060631273314357
Epoch 22, Batch 938 / 1974, Loss: 0.17398466169834137
Epoch 22, Batch 939 / 1974, Loss: 0.347292959690094
Epoch 22, Batch 940 / 1974, Loss: 0.17370176315307617
Epoch 22, Batch 941 / 1974, Loss: 0.3472590744495392
Epoch 22, Batch 942 / 1974, Loss: 0.0006160538760013878
Epoch 22, Batch 943 / 1974, Loss: