<a href="https://colab.research.google.com/github/whitechocobread/Ai-project/blob/main/%EA%B8%B0%EB%A7%90%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B8/koELECTRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install transformers
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 엑셀 파일 불러오기
file_path = '/content/drive/MyDrive/data.xlsx'
df = pd.read_excel(file_path)

# 학습 데이터와 테스트 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(df['Sentence'], df['emotions'], test_size=0.2, random_state=42)

# 학습 데이터와 테스트 데이터를 DataFrame으로 변환
train_data = pd.DataFrame({'Sentence': X_train, 'emotions': y_train})
test_data = pd.DataFrame({'Sentence': X_test, 'emotions': y_test})

# 결과 확인
train_data.head(), test_data.head()


(                                     Sentence  emotions
 38     삼성전자, 범용 메모리 제품 생산 능력 감소시키며 고부가 제품에 집중         2
 143                             BaaS 사업 추진 계획         0
 84   갤럭시 S23 FE, '갤럭시 트라이얼' 프로그램으로 체험 후 반품 가능        -2
 55             올해 삼성전자는 임원급 영입과 주니어급 지원 문턱 낮춤         2
 219                              전기차 가격 인하 경쟁        -2,
                                  Sentence  emotions
 190            엘지에너지솔루션 및 다른 기업들의 2차전지 전시         1
 6        삼성디스플레이 폴더블 패널 점유율, BOE에 역전당할 전망        -1
 79   SK하이닉스의 D램 영업이익 1조5620억원으로 상당한 증가 예상         1
 205                  울산의 2차전지 산업 발전 방안 논의         1
 117       ‘2차전지 현재와 미래를 한눈에’…울산 K배터리 쇼 개최         1)

In [3]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [4]:
# GPU 사용
device = torch.device("cuda")

In [5]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AdamW

class news_Dataset(Dataset):
    def __init__(self, data, tokenizer_name="monologg/koelectra-small-v2-discriminator"):
        # 데이터셋 초기화
        self.dataset = data
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset.iloc[idx]
        text = row['emotions']  # 또는 적절한 열 이름
        y = row['Sentence']  # 또는 적절한 열 이름

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=256,
            pad_to_max_length=True,
            add_special_tokens=True
        )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y



In [9]:
train_data = pd.DataFrame({'emotions': X_train, 'Sentence': y_train})
train_dataset =news_Dataset(train_data)
test_data = pd.DataFrame({'emotions': X_train, 'Sentence': y_train})
test_dataset =news_Dataset(train_data)

In [10]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator").to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
epochs = 5
batch_size = 16
optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)



In [15]:
train_data.head()

Unnamed: 0,emotions,Sentence
38,"삼성전자, 범용 메모리 제품 생산 능력 감소시키며 고부가 제품에 집중",2
143,BaaS 사업 추진 계획,0
84,"갤럭시 S23 FE, '갤럭시 트라이얼' 프로그램으로 체험 후 반품 가능",-2
55,올해 삼성전자는 임원급 영입과 주니어급 지원 문턱 낮춤,2
219,전기차 가격 인하 경쟁,-2


In [22]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
exportTORCH_USE_CUDA_DSA=1

In [23]:
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)

  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/13 [00:00<?, ?it/s]

RuntimeError: ignored

In [19]:
losses, accuracies

([], [])

In [None]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)