<a href="https://colab.research.google.com/github/yongjulee0213/SentenceClassifier/blob/main/pj1_mycode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 필요한 라이브러리 임포트

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW
)
import pandas as pd

In [None]:
import torch.nn as nn

In [None]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mtarcon3[0m ([33mgoorm[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import wandb

# 랜덤시드 설정

In [None]:
# Hyperparameters and the random seed shared by the rest of the notebook.
config = dict(
    epochs=3,
    learning_rate=5e-5,
    seed=41,
)


def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic
    mode.

    Args:
        seed: integer seed applied to all generators.
    """
    import random  # local import: `random` is not imported at the top of the notebook

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(config['seed'])
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA instead of failing when tensors are moved to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. 데이터 전처리

데이터가 문장으로 이루어져 있으므로, column 만들어서 review, label이 있는 csv 파일로 만들기

In [None]:
# Fetch the data.
# Upload the training, validation and test data files to Colab.
from google.colab import files
uploaded = files.upload()

KeyboardInterrupt: ignored

In [None]:
# Helper: read the sentences of a file and turn them into a DataFrame.
def file_to_sentence_dataframe(file_name):
  """Load a one-sentence-per-line text file into a labelled DataFrame.

  Every line is lower-cased and stripped of surrounding whitespace. The label
  is the last character of ``file_name`` converted to int (for example
  ``sentiment.train.1`` gives 1) and is applied to every row.

  Args:
    file_name: path of a UTF-8 text file whose name ends in a digit.

  Returns:
    DataFrame with the columns ``review`` and ``label``.

  Raises:
    ValueError: if the last character of ``file_name`` is not a digit.
  """
  with open(file_name,'r',encoding='utf-8') as f:
    # strip() already drops the trailing newline. Slicing off the last
    # character as well would eat a real character whenever the final line of
    # the file has no newline.
    file_data=[sentence.strip().lower() for sentence in f]
  file_csv=pd.DataFrame({'review':file_data,'label':int(file_name[-1])})
  return file_csv

In [None]:
# Load each split; the trailing digit of the file name (1 or 0) becomes the label column.
train_pos=file_to_sentence_dataframe('sentiment.train.1')
train_neg=file_to_sentence_dataframe('sentiment.train.0')
dev_pos=file_to_sentence_dataframe('sentiment.dev.1')
dev_neg=file_to_sentence_dataframe('sentiment.dev.0')

In [None]:
train_pos.head()

Unnamed: 0,review,label
0,excellent food .,1
1,superb customer service .,1
2,they also have daily specials and ice cream wh...,1
3,it 's a good toasted hoagie .,1
4,the staff is friendly .,1


In [None]:
dev_pos.head()

Unnamed: 0,review,label
0,these donuts have the perfect texture and taste .,1
1,good food for the price .,1
2,"a little dirty on the inside , but wonderful p...",1
3,i always order it when i go there and it is al...,1
4,the rest of the food there is good also and no...,1


In [None]:
# Merge the label-1 and label-0 frames of each split into a single frame.
# ignore_index=True renumbers the rows 0..n-1 while concatenating.
train_data = pd.concat([train_pos, train_neg], ignore_index=True)
dev_data = pd.concat([dev_pos, dev_neg], ignore_index=True)

In [None]:
train_data

Unnamed: 0,review,label
0,excellent food .,1
1,superb customer service .,1
2,they also have daily specials and ice cream wh...,1
3,it 's a good toasted hoagie .,1
4,the staff is friendly .,1
...,...,...
443254,this place sucks .,0
443255,does n't stop by to see if you need anything .,0
443256,food is terrible .,0
443257,service horrible .,0


In [None]:
dev_data

Unnamed: 0,review,label
0,these donuts have the perfect texture and taste .,1
1,good food for the price .,1
2,"a little dirty on the inside , but wonderful p...",1
3,i always order it when i go there and it is al...,1
4,the rest of the food there is good also and no...,1
...,...,...
3995,i am sad to see how much this place has gone d...,0
3996,the food here is n't very good .,0
3997,it has n't been for quite a few years .,0
3998,the service the last time i went was just terr...,0


In [None]:
train_data['review'][0]

'excellent food .'

# 3. 토크나이저 선언하기

문장 데이터를 숫자데이터로 바꾸기

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
model_name='bert-base-uncased'
tokenizer=BertTokenizer.from_pretrained(model_name)

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
# Convert sentences to token ids.
def make_id(tokenizer,data):
  """Encode every sentence in ``data`` with ``tokenizer.encode``.

  Args:
    tokenizer: object exposing ``encode(sentence)``.
    data: either a DataFrame (or dict) holding the sentences under the
      ``'review'`` key, or any plain iterable of sentences. Accepting both
      keeps this helper usable for the labelled and the unlabeled data, so a
      later redefinition for the test set cannot silently break it.

  Returns:
    List with one encoded result per sentence, in input order.

  Raises:
    KeyError: if ``data`` is a DataFrame/dict without a ``'review'`` entry.
  """
  sentences = data['review'] if isinstance(data, (pd.DataFrame, dict)) else data
  return [tokenizer.encode(sentence) for sentence in sentences]

In [None]:
train_data_id=make_id(tokenizer, train_data)
dev_data_id=make_id(tokenizer, dev_data)

In [None]:
train_data_id[:5] #숫자화 확인

[[101, 6581, 2833, 1012, 102],
 [101, 21688, 8013, 2326, 1012, 102],
 [101,
  2027,
  2036,
  2031,
  3679,
  19247,
  1998,
  3256,
  6949,
  2029,
  2003,
  2428,
  2204,
  1012,
  102],
 [101,
  2009,
  1005,
  1055,
  1037,
  2204,
  15174,
  2098,
  7570,
  22974,
  2063,
  1012,
  102],
 [101, 1996, 3095, 2003, 5379, 1012, 102]]

In [None]:
class SentimentTestDataset(object):
    """Map-style dataset pairing tokenised reviews with their labels.

    Args:
        dataframe_ids: sequence of token-id lists, one per review.
        dataframe: frame (or mapping) whose ``'label'`` entry holds one label
            per review, in the same order as ``dataframe_ids``.
    """

    def __init__(self,dataframe_ids, dataframe):
        self.dataframe=dataframe
        self.dataframe_ids=dataframe_ids
        # Snapshot the labels once as a NumPy array: lookups become positional
        # (independent of the frame's index labels) and avoid a pandas Series
        # lookup for every sample. Later edits to `dataframe` are not seen.
        self.labels=np.asarray(dataframe['label'])

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` as NumPy arrays for position ``index``."""
        return np.array(self.dataframe_ids[index]),np.array(self.labels[index])

In [None]:
#데이터 셋 구성 ; get_item ; 숫자화된 리뷰데이터와 레이블 가져오기
train_dataset = SentimentTestDataset(train_data_id,train_data)
dev_dataset = SentimentTestDataset(dev_data_id,dev_data)

In [None]:
train_dataset[:5] #확인

  return np.array(self.dataframe_ids[index]),np.array(self.dataframe['label'][index])


(array([list([101, 6581, 2833, 1012, 102]),
        list([101, 21688, 8013, 2326, 1012, 102]),
        list([101, 2027, 2036, 2031, 3679, 19247, 1998, 3256, 6949, 2029, 2003, 2428, 2204, 1012, 102]),
        list([101, 2009, 1005, 1055, 1037, 2204, 15174, 2098, 7570, 22974, 2063, 1012, 102]),
        list([101, 1996, 3095, 2003, 5379, 1012, 102])], dtype=object),
 array([1, 1, 1, 1, 1]))

In [None]:
dev_dataset[:5]#확인

  return np.array(self.dataframe_ids[index]),np.array(self.dataframe['label'][index])


(array([list([101, 2122, 2123, 16446, 2031, 1996, 3819, 14902, 1998, 5510, 1012, 102]),
        list([101, 2204, 2833, 2005, 1996, 3976, 1012, 102]),
        list([101, 1037, 2210, 6530, 2006, 1996, 2503, 1010, 2021, 6919, 2111, 2008, 2147, 2045, 999, 102]),
        list([101, 1045, 2467, 2344, 2009, 2043, 1045, 2175, 2045, 1998, 2009, 2003, 2467, 12476, 1012, 102]),
        list([101, 1996, 2717, 1997, 1996, 2833, 2045, 2003, 2204, 2036, 1998, 2025, 2200, 6450, 1012, 102])],
       dtype=object), array([1, 1, 1, 1, 1]))

In [None]:
def collate_fn_style(samples):
    """Collate variable-length ``(input_ids, label)`` samples into padded tensors.

    The batch is reordered from the longest sequence to the shortest, and the
    labels are reordered the same way.

    Args:
        samples: list of ``(input_ids, label)`` pairs as produced by the dataset.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``.
    """
    sequences, labels = zip(*samples)  # tokenised reviews / sentiment labels
    lengths = [len(sequence) for sequence in sequences]
    max_len = max(lengths)  # length of the longest sentence in the batch

    # argsort orders the lengths ascending; reversing it puts the longest
    # sentence first and the shortest last.
    sorted_indices = np.argsort(lengths)[::-1]

    # pad_sequence(sequences, batch_first, padding_value) brings the
    # variable-length sequences to a common length, filling the gaps with 0.
    # batch_first=True yields B x T x *, otherwise T x B x *.
    input_ids = pad_sequence(
        [torch.tensor(sequences[index]) for index in sorted_indices],
        batch_first=True,
    )

    # Attention mask: 1 for real tokens (attention is computed), 0 for the
    # padded tail (no attention needed there).
    mask_rows = []
    for index in sorted_indices:
        real_tokens = lengths[index]
        mask_rows.append([1] * real_tokens + [0] * (max_len - real_tokens))
    attention_mask = torch.tensor(mask_rows)

    # Segment ids mark which sentence a token belongs to (first sentence 0,
    # second sentence 1); every row here is all zeros over the padded width.
    padded_len = len(input_ids[0])
    token_type_ids = torch.tensor([[0] * padded_len for _ in sorted_indices])
    position_ids = torch.tensor([list(range(padded_len)) for _ in sorted_indices])
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

In [None]:
# Batch sizes for training and evaluation.
train_batch_size = 32
eval_batch_size = 64

# The samples do not have a fixed length, so a DataLoader with a batch size of
# 2 or more cannot stack them on its own and raises an error; a hand-written
# collate function has to be passed in to build each batch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn_style,
)
dev_loader = torch.utils.data.DataLoader(
    dataset=dev_dataset,
    batch_size=eval_batch_size,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn_style,
)

# 4. 모델 선언하기


In [None]:

bert_model=BertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
#빠른 학습을 위해 gpu로 사용하고 모델을 gpu에 업로드
bert_model.to(device) #bert의 마지막단이 classifier이므로 굳이 모델 새로 생성할 필요없음

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# 5. 모델 훈련

In [None]:
# Initialise wandb before use and set the run name.
wandb.init(project='test-project',entity='goorm')
wandb.run.name='mycode_functions'

VBox(children=(Label(value='0.001 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.261565…

In [None]:
# Accuracy computation.
def compute_acc(predictions, target_labels):
    """Return the fraction of positions where prediction and target agree."""
    predicted = np.array(predictions)
    expected = np.array(target_labels)
    matches = predicted == expected
    return matches.mean()

In [None]:
def train(model, train_loader,learning_rate,epochs):
  """Fine-tune ``model`` on ``train_loader`` and validate after every epoch.

  Relies on notebook-level names: ``device``, ``AdamW``, ``wandb``,
  ``compute_acc``, ``validation`` and ``dev_loader``.

  Args:
    model: sequence-classification model whose output exposes ``.loss`` and
      ``.logits``; it is updated in place.
    train_loader: iterable yielding
      ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``.
    learning_rate: learning rate handed to the optimizer.
    epochs: number of passes over ``train_loader``.
  """
  # AdamW is the class imported from transformers at the top of the notebook.
  # Optimise the model that was passed in, with the learning rate that was
  # passed in, rather than reaching for notebook globals.
  optimizer = AdamW(model.parameters(), lr=learning_rate)

  for epoch in range(epochs):
    # Start each epoch with empty buffers so the logged loss and accuracy
    # describe this epoch only, not a running total over all epochs so far.
    train_losses, train_hypothesis, train_target_labels = [], [], []
    model.train()
    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(train_loader, unit="batch"):
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      token_type_ids = token_type_ids.to(device)
      position_ids = position_ids.to(device)
      labels = labels.to(device, dtype=torch.long)

      optimizer.zero_grad()  # clear the gradients of the previous step

      hypothesis = model(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         position_ids=position_ids,
                         labels=labels)

      loss = hypothesis.loss
      # Keep a plain float; appending the tensor itself would hold on to
      # device memory and autograd state for every batch of the epoch.
      train_losses.append(loss.item())

      logits = hypothesis.logits  # one [class-0 score, class-1 score] row per example

      # Predict 0 only when the class-0 score is strictly larger, otherwise 1.
      # Computed for the whole batch at once instead of example by example.
      train_hypothesis += (~(logits[:, 0] > logits[:, 1])).long().tolist()
      train_target_labels += labels.tolist()

      loss.backward()
      optimizer.step()

    train_acc = compute_acc(train_hypothesis,train_target_labels)
    train_loss=sum(train_losses)/len(train_losses)
    wandb.log({'train_loss':train_loss,'train_acc':train_acc})
    print(f'epoch:{epoch+1} train loss: {train_loss}')  # one log line per epoch

    validation(model,dev_loader)  # validate after each training epoch
  wandb.run.save()


In [None]:
def validation(model,dev_loader, save_path="/pytorch_model.bin"):
  """Evaluate ``model`` on ``dev_loader`` and checkpoint it when the loss improves.

  The lowest validation loss seen so far is remembered on the function object
  (``validation.lowest_valid_loss``), so the checkpoint is only overwritten by
  a better model. Re-running the cell that defines this function, or deleting
  that attribute, resets the record.

  Relies on notebook-level names: ``device``, ``wandb`` and ``compute_acc``.

  Args:
    model: sequence-classification model whose output exposes ``.loss`` and
      ``.logits``.
    dev_loader: iterable yielding
      ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``.
    save_path: where the best ``state_dict`` is written.
  """
  valid_losses, valid_hypothesis, valid_target_labels=[], [], []
  model.eval()  # switch to evaluation mode
  with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader, unit="batch"):
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      token_type_ids = token_type_ids.to(device)
      position_ids = position_ids.to(device)
      labels = labels.to(device, dtype=torch.long)

      # Evaluate the model that was passed in, not a notebook global.
      hypothesis=model(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids, position_ids=position_ids,labels=labels)

      # Fields of transformers.modeling_outputs.SequenceClassifierOutput:
      # https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.SequenceClassifierOutput
      logits=hypothesis.logits  # one [class-0 score, class-1 score] row per example
      valid_losses.append(hypothesis.loss.item())  # plain float, not a device tensor

      # Predict 0 only when the class-0 score is strictly larger, otherwise 1.
      valid_hypothesis += (~(logits[:, 0] > logits[:, 1])).long().tolist()
      valid_target_labels += labels.tolist()

  acc = compute_acc(valid_hypothesis, valid_target_labels)
  valid_loss = sum(valid_losses) / len(valid_losses)
  wandb.log({'val_loss':valid_loss,'val acc':acc})
  print(f'val loss: {valid_loss}, val acc:{acc}')

  # A local variable would be reset on every call, so every epoch would look
  # like an improvement; the record is therefore stored on the function.
  lowest_valid_loss = getattr(validation, 'lowest_valid_loss', float('inf'))
  if lowest_valid_loss > valid_loss:
    validation.lowest_valid_loss = valid_loss
    print('Acc for model which have lower valid loss: ', acc)
    torch.save(model.state_dict(), save_path)

  wandb.run.save()

In [None]:
train(bert_model, train_loader, config['learning_rate'],config['epochs'])

100%|██████████| 13852/13852 [31:33<00:00,  7.31batch/s]


epoch:1 train loss: 0.07276296615600586


100%|██████████| 63/63 [00:05<00:00, 11.76batch/s]


val loss: 0.05519659444689751, val acc:0.9805
Acc for model which have lower valid loss:  0.9805




# 6. 모델 테스트파일 내보내기

In [None]:
import pandas as pd
test_df=pd.read_csv('test_no_label.csv')#문장 데이터만 있는 csv파일 가져오기

In [None]:
test_df#문장(리뷰)데이터만 있는 csv파일

Unnamed: 0,Id
0,it 's a whole new experience and new flavors e...
1,so disappointing from an old favorite .
2,it is the most authentic thai in the valley .
3,do not sign a lease with these people .
4,i was nervous and she made me feel so comforta...
...,...
995,the food here is delicious .
996,we 'll certainly be back !
997,the building itself looks abandoned .
998,if i could give zero stars i def would .


In [None]:
test_data=test_df['Id']#id만 가져와서 사용할 것임

In [None]:
test_data

0      it 's a whole new experience and new flavors e...
1                so disappointing from an old favorite .
2          it is the most authentic thai in the valley .
3                do not sign a lease with these people .
4      i was nervous and she made me feel so comforta...
                             ...                        
995                         the food here is delicious .
996                           we 'll certainly be back !
997                the building itself looks abandoned .
998             if i could give zero stars i def would .
999    the beer is n't bad , but the food was less th...
Name: Id, Length: 1000, dtype: object

In [None]:
# Convert sentences to token ids.
def make_id(tokenizer,data):
  """Encode every sentence in ``data`` with ``tokenizer.encode``.

  This definition replaces the earlier ``make_id`` in the notebook, so it
  accepts both input shapes: a plain iterable of sentences (the test data
  needs no column name) and a DataFrame/dict holding the sentences under
  ``'review'`` (the train/dev data). Without that, re-running a train/dev
  cell after this one would iterate over column names instead of sentences.

  Args:
    tokenizer: object exposing ``encode(sentence)``.
    data: iterable of sentences, or a DataFrame/dict with a ``'review'`` entry.

  Returns:
    List with one encoded result per sentence, in input order.

  Raises:
    KeyError: if ``data`` is a DataFrame/dict without a ``'review'`` entry.
  """
  sentences = data['review'] if isinstance(data, (pd.DataFrame, dict)) else data
  return [tokenizer.encode(sentence) for sentence in sentences]
  

In [None]:
test_id=make_id(tokenizer,test_data) #문장->수치

In [None]:
class SentimentTestDataset(object):
    """Map-style dataset over tokenised reviews, with optional labels.

    This definition replaces the earlier class of the same name in the
    notebook, so it also accepts the labelled form: when ``dataframe`` is
    given, items are ``(token_ids, label)`` pairs; without it (the unlabeled
    test data) items are the token ids alone.

    Args:
        dataframe_ids: sequence of token-id lists, one per review.
        dataframe: optional frame (or mapping) whose ``'label'`` entry holds
            one label per review, in the same order as ``dataframe_ids``.
    """

    def __init__(self,dataframe_ids, dataframe=None):
        self.dataframe_ids=dataframe_ids
        self.dataframe=dataframe
        # Labels are snapshotted as a NumPy array so lookups are positional.
        self.labels=None if dataframe is None else np.asarray(dataframe['label'])

    def __len__(self):
        return len(self.dataframe_ids)

    def __getitem__(self, index):
        """Return the token ids at ``index``, plus the label when labels exist."""
        token_ids = np.array(self.dataframe_ids[index])
        if self.labels is None:
            # Unlike the training/dev data, the test data has no label.
            return token_ids
        return token_ids, np.array(self.labels[index])

In [None]:
test_dataset=SentimentTestDataset(test_id) #데이터셋 생성

In [None]:
test_dataset

<__main__.SentimentTestDataset at 0x7ff0fd862c10>

In [None]:
def collate_fn_style_test(samples):
    """Collate unlabeled, variable-length token-id samples into padded tensors.

    The samples keep their incoming order.

    Args:
        samples: list of token-id sequences as produced by the test dataset.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids)``.
    """
    batch_positions = range(len(samples))
    lengths = [len(samples[position]) for position in batch_positions]
    max_len = max(lengths)

    # Right-pad every sequence with 0 up to the longest one in the batch.
    input_ids = pad_sequence(
        [torch.tensor(samples[position]) for position in batch_positions],
        batch_first=True,
    )

    # 1 for real tokens, 0 for the padded tail.
    mask_rows = []
    for real_tokens in lengths:
        mask_rows.append([1] * real_tokens + [0] * (max_len - real_tokens))
    attention_mask = torch.tensor(mask_rows)

    # Both tensors span the padded width: all-zero segment ids and 0..T-1 positions.
    padded_len = len(input_ids[0])
    token_type_ids = torch.tensor([[0] * padded_len for _ in batch_positions])
    position_ids = torch.tensor([list(range(padded_len)) for _ in batch_positions])

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
test_batch_size = 32
# Create the data loader for the test set.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn_style_test,
)

In [None]:
# Inference also runs in evaluation mode, with gradient tracking disabled.
bert_model.eval()
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, token_type_ids, position_ids = (
            tensor.to(device) for tensor in batch
        )

        output = bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )

        logits = output.logits  # one [class-0 score, class-1 score] row per example
        # Same rule as during training: 0 only when the class-0 score is
        # strictly larger than the class-1 score, otherwise 1.
        predictions.extend((~(logits[:, 0] > logits[:, 1])).long().tolist())

100%|██████████| 32/32 [00:02<00:00, 13.80it/s]


In [None]:
test_df['Category'] = predictions   # 예측값을 테스트 데이터 프레임에 붙입니다!

In [None]:
test_df.to_csv('mycode_submission_2.csv', index=False)

In [None]:
test_df

Unnamed: 0,Id,Category
0,it 's a whole new experience and new flavors e...,1
1,so disappointing from an old favorite .,0
2,it is the most authentic thai in the valley .,1
3,do not sign a lease with these people .,0
4,i was nervous and she made me feel so comforta...,1
...,...,...
995,the food here is delicious .,1
996,we 'll certainly be back !,1
997,the building itself looks abandoned .,0
998,if i could give zero stars i def would .,0
