In [None]:
!pip install kobert-transformers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.model_selection import train_test_split


In [None]:
# Read the train/test splits as tab-separated tables.
train = pd.read_csv("/content/ratings_train.txt", sep="\t")
test = pd.read_csv("/content/ratings_test.txt", sep="\t")

In [None]:
# Only the last bare expression of a cell is rendered, so a plain
# `train.head()` followed by `test.head()` would show the test preview alone.
# display() renders both previews.
display(train.head(), test.head())

In [None]:
# Summarize the training frame (DataFrame.info) before any cleaning is applied.
train.info()

In [None]:
# Preprocessing: discard rows whose 'document' text is missing, in both splits.
train, test = (
    split.dropna(subset=['document']) for split in (train, test)
)

In [None]:
from kobert_transformers import get_kobert_model, get_tokenizer
from torch.optim import AdamW

# Load the KoBERT model and tokenizer through the kobert_transformers helpers.
# NOTE: `model` holds the bare encoder at this point; the training cell later
# rebinds the same name to the SentimentClassifier wrapper built around it.
model = get_kobert_model()
tokenizer = get_tokenizer()


In [None]:
# Define the PyTorch dataset and data loader
from torch.utils.data import Dataset, DataLoader
import torch

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable, sized sequence of input strings.
        labels: Indexable sequence of class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=max_len, return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length each example is padded/truncated to by the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len: int = 128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Assumes the tokenizer returns tensors with a leading batch axis of
        # size 1 for a single text. Only that axis is removed: a bare squeeze()
        # would also collapse the sequence axis whenever max_len == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            # Class-index targets must be integer (long) tensors for
            # CrossEntropyLoss; pin the dtype instead of relying on inference
            # from the Python/NumPy type of the label.
            "labels": torch.tensor(label, dtype=torch.long),
        }

    def __len__(self) -> int:
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

# Prepare the training dataset and its shuffled mini-batch loader
train_dataset = SentimentDataset(
    texts=train['document'].tolist(),
    labels=train['label'].tolist(),
    tokenizer=tokenizer,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)


In [None]:
# Model definition and training
import torch.nn as nn

# Classification model for sentiment analysis
class SentimentClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        bert_model: Encoder exposing ``config.hidden_size``; calling it with
            ``input_ids`` and ``attention_mask`` must return something whose
            index 1 is the pooled representation fed to the head.
        num_classes: Number of logits produced per example.
    """

    def __init__(self, bert_model, num_classes=2):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(p=0.3)
        hidden_dim = bert_model.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids and masks."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 1 of the encoder output is taken as the pooled sentence vector.
        pooled = encoder_out[1]
        return self.classifier(self.dropout(pooled))

# Initialize the model, optimizer and loss.
# `model` arrives here as the bare KoBERT encoder and is rebound to the
# classifier wrapper. The isinstance guard keeps this cell re-runnable: without
# it, a second run would wrap the classifier in another classifier and fail when
# the constructor reads `config.hidden_size` from the wrapper.
# On a re-run the existing (possibly already trained) classifier is kept and
# only the optimizer/loss are recreated; reload KoBERT to start from scratch.
if not isinstance(model, SentimentClassifier):
    model = SentimentClassifier(model)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training setup: use the GPU when one is available and move the model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

NUM_EPOCHS = 3  # set to the desired number of epochs

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean batch loss so training progress is visible;
    # max(..., 1) avoids a division by zero if the loader yields no batches.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"epoch {epoch + 1}/{NUM_EPOCHS} - mean loss: {mean_loss:.4f}")


In [None]:
import torch.nn.functional as F

def predict_sentiment(text: str) -> float:
    """Return the predicted probability, in percent, that ``text`` is positive.

    Relies on the module-level ``model``, ``tokenizer`` and ``device``. The
    model is switched to eval mode and left in that mode afterwards.

    Args:
        text: Sentence to classify.

    Returns:
        Softmax probability of class index 1 for the single input, multiplied
        by 100.
    """
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    # Inference needs no gradients; disabling autograd avoids building a graph
    # for every prediction.
    with torch.no_grad():
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        probs = F.softmax(outputs, dim=1)
    positive_percent = probs[0][1].item() * 100  # probability of the positive class
    return positive_percent

# Example prediction: score one sample review and print the positive-class
# percentage formatted to two decimals.
text = "이 영화 정말 재미있다!"
positive_percent = predict_sentiment(text)
print(f"긍정 비율: {positive_percent:.2f}%")
