

```
# 영구 설치한 라이브러리 불러오기
import os, sys  
from google.colab import drive  
drive.mount('/content/drive')   

pg_path = '/content/packages'  

os.symlink('/content/drive/My Drive/Colab Notebooks', pg_path)  
sys.path.insert(0, pg_path)
```



In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import logging
logging.set_verbosity_error() # 경고문 안 띄우기

In [3]:
import torch
import pandas as pd
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset

In [4]:
# 데이터 경로 설정하고 읽어옴
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Watson/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Watson/data/test.csv")
train

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1
...,...,...,...,...,...,...
12115,2b78e2a914,The results of even the most well designed epi...,All studies have the same amount of uncertaint...,en,English,2
12116,7e9943d152,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...,en,English,0
12117,5085923e6c,The important thing is to realize that it's wa...,"It cannot be moved, now or ever.",en,English,2
12118,fc8e2fd1fe,At the west end is a detailed model of the who...,The model temple complex is at the east end.,en,English,2


In [5]:
# 영어 빼고 제거
train = train[train['lang_abv'] == 'en']
test = test[test['lang_abv'] == 'en']
train

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
7,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,en,English,2
8,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,en,English,1
...,...,...,...,...,...,...
12115,2b78e2a914,The results of even the most well designed epi...,All studies have the same amount of uncertaint...,en,English,2
12116,7e9943d152,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...,en,English,0
12117,5085923e6c,The important thing is to realize that it's wa...,"It cannot be moved, now or ever.",en,English,2
12118,fc8e2fd1fe,At the west end is a detailed model of the who...,The model temple complex is at the east end.,en,English,2


In [6]:
# 불필요한 열 제거
train = train.drop(columns=['lang_abv', 'language'])
test = test.drop(columns=['lang_abv', 'language'])
train

Unnamed: 0,id,premise,hypothesis,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,2
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,0
7,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,2
8,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,1
...,...,...,...,...
12115,2b78e2a914,The results of even the most well designed epi...,All studies have the same amount of uncertaint...,2
12116,7e9943d152,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...,0
12117,5085923e6c,The important thing is to realize that it's wa...,"It cannot be moved, now or ever.",2
12118,fc8e2fd1fe,At the west end is a detailed model of the who...,The model temple complex is at the east end.,2


In [7]:
# 빈 인덱스 삭제
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train

Unnamed: 0,id,premise,hypothesis,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,2
2,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,0
3,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,2
4,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,1
...,...,...,...,...
6865,2b78e2a914,The results of even the most well designed epi...,All studies have the same amount of uncertaint...,2
6866,7e9943d152,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...,0
6867,5085923e6c,The important thing is to realize that it's wa...,"It cannot be moved, now or ever.",2
6868,fc8e2fd1fe,At the west end is a detailed model of the who...,The model temple complex is at the east end.,2


In [8]:
test

Unnamed: 0,id,premise,hypothesis
0,aa2510d454,His family had lost a son and a daughter now.,The son and daughter had lost their father.
1,865d1c7b16,Steps are initiated to allow program board mem...,There's enough room for 35-40 positions on the...
2,6d9fa191e6,"agencies' operating trust, enterprise and inte...",Agencies in financial trouble are usually audi...
3,f11f1ffffe,how long has he been in his present position,What length of time has he held the current po...
4,40a9b0f08e,Research and development is composed of,R&D is made up of.
...,...,...,...
2940,7618dd5a26,"However, in the off-field (sentimental) tourna...",The Jets have the most appealing story line.
2941,3189b0ae29,see now in a situation like that the boys are ...,Everyone involved was the same age.
2942,f357a04e86,The rock has a soft texture and can be bough...,The rock is harder than most types of rock.
2943,0407b48afb,isn't it i can remember i've only been here ei...,I could see downtown Dallas from where I lived...


In [9]:
# 결측치 확인
train.isna().sum().sum()
test.isna().sum().sum()

0

In [10]:
# train이랑 val 나누기
from sklearn.model_selection import train_test_split

# train과 validation 데이터셋으로 분리
train, val = train_test_split(
    train, test_size=0.2, random_state=42)
print('train:\n', train)
print('\n\nval:\n', val)

train:
               id                                            premise  \
252   b6e01c1a07  Also, the Holy Family are said to have shelter...   
2705  b8fa1a0044  Participants generally viewed the new internal...   
4442  c828f51ef6  With a little practice almost anyone can flip ...   
4912  b2c98d5a99  More reserved and remote but a better administ...   
6194  9cd35fee05  The company later told us that it had disconti...   
...          ...                                                ...   
3772  b2e8cf0604  The ITC has enlisted legal services attorneys ...   
5191  9a25eec056  In the stock market, however, the damage can g...   
5226  b50116370a  The recommendation comes from the court's Task...   
5390  cb0cdc2407  For centuries, the Loire river was a vital hig...   
860   b4bd9114b8  Around 1500 b.c. , a massive volcanic eruption...   

                                             hypothesis  label  
252   The Holy family spent a total of three days here.      1  
2705  Tho

In [11]:
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [12]:
val

Unnamed: 0,id,premise,hypothesis,label
0,a937bd90a8,"3) Dare you rise to the occasion, like Raskol...",Would you rise up and defeaat all evil lords i...,1
1,0abb9434d5,"He married Dona Filipa Moniz (Perestrelo), the...",He landed on the island but soon left for gree...,2
2,bc52edfd5b,The contrast between the landscape of the cent...,There was a beautiful artist who painted the l...,1
3,62344708db,i don't know i i do i can think of all the uh ...,I know this because I own a bible.,1
4,03f3c9a92e,'So I assume he hacked into the autopilot and ...,I'm assuming he hacked the autopilot to bring ...,1
...,...,...,...,...
1369,52addde9aa,GAO's recommendations are intended to improve ...,The GAO works for the benefit of the American ...,0
1370,97607fbf89,My last afternoon in Louisian was supposed to ...,My last day in Louisian was very exciting.,2
1371,e9e9ff5992,oh hum well uh i haven't for some reason have ...,I am very excited about football in the summer.,2
1372,a8b91666b6,"Personal Communication with P. Croteau, Babcoc...","In August 2001, there was personal communicati...",1


In [13]:
# 지명?이 있어서 cased 선택
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)

In [14]:
class WatsonDataset(Dataset):
    def __init__(self, data_df, tokenizer):
        self.data = data_df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        premise = self.data.loc[index, 'premise']
        hypothesis = self.data.loc[index, 'hypothesis']
        label = self.data.loc[index, 'label']
        encoding = self.tokenizer.encode_plus(premise, hypothesis, add_special_tokens=True, 
                                               padding='max_length', truncation=True, max_length=256, 
                                               return_attention_mask=True, return_tensors='pt')
        # add_special_tokens=True : 앞뒤에 [CLS], [SEP] 추가
        return {'input_ids': encoding['input_ids'].squeeze(), 
                'attention_mask': encoding['attention_mask'].squeeze(),
                'label': torch.tensor(label, dtype=torch.long)}


In [15]:
train_dataset = WatsonDataset(train, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = WatsonDataset(val, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=32)

In [16]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fc1012955e0>

In [17]:
torch.cuda.is_available()

True

In [18]:
# gpu 사용
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
# 아담 최적화
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# 크로스엔트로피
criterion = torch.nn.CrossEntropyLoss()

In [20]:
for epoch in range(5):
    # 한 에포크당 실행
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], 
                        labels=batch['label'])
        loss = criterion(outputs.logits, batch['label'])
        loss.backward()
        optimizer.step()

    # 평가
    model.eval()
    with torch.no_grad():
        val_loss = 0
        val_acc = 0
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], 
                            labels=batch['label'])
            val_loss += criterion(outputs.logits, batch['label']).item()
            val_acc += (outputs.logits.argmax(dim=-1) == batch['label']).sum().item()
        val_loss /= len(val_dataloader.dataset)
        val_acc /= len(val_dataloader.dataset)
        print(f'Epoch {epoch+1}: Validation Loss = {val_loss:.4f}, Validation Accuracy = {val_acc:.4f}')

Epoch 1: Validation Loss = 0.0274, Validation Accuracy = 0.6150
Epoch 2: Validation Loss = 0.0244, Validation Accuracy = 0.6630
Epoch 3: Validation Loss = 0.0285, Validation Accuracy = 0.6659
Epoch 4: Validation Loss = 0.0356, Validation Accuracy = 0.6536
Epoch 5: Validation Loss = 0.0403, Validation Accuracy = 0.6594
