<a href="https://colab.research.google.com/github/yoonseongan/2024_2_NLP/blob/main/%EA%B0%90%EC%84%B1%EB%B6%84%EC%84%9D%EB%AA%A8%EB%8D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# **Step 1: 데이터 로드 및 전처리**
# CSV 파일 로드
data = pd.read_csv("/content/naverReview.csv")

# 별점(1~5)을 0(부정) 또는 1(긍정)으로 변환
def changeTo01(x):
    return 0 if x < 3 else 1

data['star'] = data['star'].apply(changeTo01)

# 리뷰와 라벨로 데이터 정리
data_list = [[review, label] for review, label in zip(data['review'], data['star'])]

# 학습 데이터와 테스트 데이터로 분리
train, test = train_test_split(data_list, test_size=0.2, shuffle=True, random_state=42)

# **Step 2: 데이터셋 클래스 정의**
class ReviewDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review, label = self.data[idx]
        encoding = self.tokenizer(
            review,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# **Step 3: Hugging Face의 KoBERT 모델 및 토크나이저 로드**
MODEL_NAME = "monologg/kobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# **Step 4: 데이터로더 생성**
max_len = 64
batch_size = 32

train_dataset = ReviewDataset(train, tokenizer, max_len)
test_dataset = ReviewDataset(test, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# **Step 5: 학습 준비**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

loss_fn = nn.CrossEntropyLoss()

# **Step 6: 모델 학습**
epochs = 3
for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_acc = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        train_acc += (preds == labels).sum().item()

    train_loss /= len(train_loader)
    train_acc /= len(train_dataset)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}")

# **Step 7: 모델 평가**
model.eval()
test_loss = 0
test_acc = 0

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        test_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        test_acc += (preds == labels).sum().item()

test_loss /= len(test_loader)
test_acc /= len(test_dataset)
print(f"Test Loss = {test_loss:.4f}, Test Accuracy = {test_acc:.4f}")

# **Step 8: 모델 저장**
torch.save(model.state_dict(), "/content/kobert_sentiment_model.pt")
tokenizer.save_pretrained("/content/kobert_sentiment_model.pt")
print("Model and tokenizer saved!")

# **Step 9: 예측 함수**
def predict(sentence):
    model.eval()
    encoding = tokenizer(
        sentence,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()

    return "긍정적" if pred == 1 else "부정적"

# **Step 10: 사용자 입력 예측**
while True:
    sentence = input("리뷰를 입력하세요 (종료하려면 0 입력): ")
    if sentence == "0":
        print("프로그램 종료!")
        break
    print(f"리뷰 예측: {predict(sentence)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

tokenization_kobert.py:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/monologg/kobert:
- tokenization_kobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5000/5000 [27:24<00:00,  3.04it/s]


Epoch 1: Train Loss = 0.2494, Train Accuracy = 0.9081


100%|██████████| 5000/5000 [27:30<00:00,  3.03it/s]


Epoch 2: Train Loss = 0.1792, Train Accuracy = 0.9379


100%|██████████| 5000/5000 [27:29<00:00,  3.03it/s]


Epoch 3: Train Loss = 0.1397, Train Accuracy = 0.9535


100%|██████████| 1250/1250 [02:20<00:00,  8.87it/s]


Test Loss = 0.1921, Test Accuracy = 0.9350


TypeError: KoBertTokenizer.save_vocabulary() got an unexpected keyword argument 'filename_prefix'

In [5]:
tokenizer.save_pretrained("/content/kobert_sentiment_model.pt")
print("Model and tokenizer saved!")

# **Step 9: 예측 함수**
def predict(sentence):
    model.eval()
    encoding = tokenizer(
        sentence,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()

    return "긍정적" if pred == 1 else "부정적"

# **Step 10: 사용자 입력 예측**
while True:
    sentence = input("리뷰를 입력하세요 (종료하려면 0 입력): ")
    if sentence == "0":
        print("프로그램 종료!")
        break
    print(f"리뷰 예측: {predict(sentence)}")

Provided path (/content/kobert_sentiment_model.pt) should be a directory, not a file


Model and tokenizer saved!
리뷰를 입력하세요 (종료하려면 0 입력): 거 랑 주문진 해변 끝 이지만 예쁘니까 아무튼   주말 이라서 그런지 추운 날인데도 줄 서서 사진 찍었어요
리뷰 예측: 긍정적
리뷰를 입력하세요 (종료하려면 0 입력): 0
프로그램 종료!


In [12]:

import pandas as pd
# **Step 1: 데이터 로드**
# 파일에서 데이터를 로드합니다.
data = pd.read_csv("충주_2_토큰화.csv")  # 파일 경로에 맞게 수정
#data = data[["cleaned_review"]]  # 필요한 컬럼만 선택


# **Step 3: 감성 예측 함수 정의**
def predict(sentence):
    """
    입력된 문장을 감성 분석 (긍정/부정)으로 예측합니다.
    """
    encoding = tokenizer(
        sentence,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():  # 추론 시 그래디언트 비활성화
        outputs = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()

    return "긍정적" if pred == 1 else "부정적"

# **Step 4: 데이터에 감성 예측 추가**
# "cleaned_review" 컬럼을 기반으로 예측 결과를 새 열에 추가
data["sentiment"] = data["cleaned_review"].apply(predict)

# **Step 5: 결과 저장**
# 감성 분석 결과를 포함한 데이터를 CSV로 저장
output_file = "충주_with_sentiment.csv"
data.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"분석 결과가 저장되었습니다: {output_file}")

분석 결과가 저장되었습니다: 충주_with_sentiment.csv
