In [1]:
from torchtext.datasets import IMDB

# Build one iterator per IMDB split, in the order (train, test).
train_iter, test_iter = (IMDB(split=split_name) for split_name in ('train', 'test'))

In [2]:
import random
# Fix the RNG seed so the two samples drawn below are reproducible.
random.seed(6)

# Materialize the dataset iterators as lists so they can be sampled from.
train_lists = [*train_iter]
test_lists = [*test_iter]

# Randomly draw 1000 examples from each split (train first, then test).
train_lists_small = random.sample(train_lists, k=1000)
test_lists_small = random.sample(test_lists, k=1000)

# Show the first (index 0) element of each sampled list.
print(train_lists_small[0], test_lists_small[0], sep='\n')

(2, "I LOVED this movie! I am biased seeing as I am a huge Disney fan, but I really enjoyed myself. The action takes off running in the beginning of the film and just keeps going! This is a bit of a departure for Disney, they don't spend quite as much time on character development (my husband pointed this out)and there are no musical numbers. It is strictly action adventure. I thoroughly enjoyed it and recommend it to anyone who loves Disney, be they young or old.")
(1, 'This was an abysmal show. In short it was about this kid called Doug who guilt-tripped a lot. Seriously he could feel guilty over killing a fly then feeling guilty over feeling guilty for killing the fly and so forth. The animation was grating and unpleasant and the jokes cheap. <br /><br />It aired here in Sweden as a part of the "Disney time" show and i remember liking it some what but then i turned 13.<br /><br />I never got why some of the characters were green and purple too. What was up with that? <br /><br />Tru

In [3]:
# Two independent, initially empty containers for the training texts and labels.
train_texts, train_labels = [], []

In [4]:
def _split_texts_and_labels(samples):
  """Split (label, text) pairs into parallel text and label lists.

  Args:
    samples: iterable of (label, text) tuples.

  Returns:
    (texts, labels): `texts` holds the text of every pair in order; `labels`
    holds 1 where the raw label equals 2 and 0 for every other raw label.
  """
  texts = []
  labels = []
  for label, text in samples:
    # Remap the raw IMDB label 2 to 1, and the raw label 1 to 0.
    labels.append(1 if label == 2 else 0)
    texts.append(text)
  return texts, labels


# Build fresh lists on every run instead of appending to lists created in an
# earlier cell, so that re-running this cell does not duplicate the examples.
train_texts, train_labels = _split_texts_and_labels(train_lists_small)

# Same conversion for the test sample.
test_texts, test_labels = _split_texts_and_labels(test_lists_small)


print(train_texts[0])
print(train_labels[0])
print(test_texts[0])
print(test_labels[0])

I LOVED this movie! I am biased seeing as I am a huge Disney fan, but I really enjoyed myself. The action takes off running in the beginning of the film and just keeps going! This is a bit of a departure for Disney, they don't spend quite as much time on character development (my husband pointed this out)and there are no musical numbers. It is strictly action adventure. I thoroughly enjoyed it and recommend it to anyone who loves Disney, be they young or old.
1
This was an abysmal show. In short it was about this kid called Doug who guilt-tripped a lot. Seriously he could feel guilty over killing a fly then feeling guilty over feeling guilty for killing the fly and so forth. The animation was grating and unpleasant and the jokes cheap. <br /><br />It aired here in Sweden as a part of the "Disney time" show and i remember liking it some what but then i turned 13.<br /><br />I never got why some of the characters were green and purple too. What was up with that? <br /><br />Truly a horri

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sampled training examples as a validation set.
# NOTE(review): this rebinds train_texts / train_labels to the smaller split,
# so running this cell a second time splits the already-split data again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=3,
)

# Sizes of the resulting splits, one per line.
print(len(train_texts), len(train_labels), len(val_texts), len(val_labels), sep='\n')

800
800
200
200


In [6]:
# Load the tokenizer that belongs to the distilbert-base-uncased checkpoint.
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Run the tokenizer over the train, validation and test texts (in that order).
# truncation=True cuts off any part of an input that exceeds the model's
# default max_length.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts, test_texts)
)

# input_ids of the first five tokens of the first training text.
print(train_encodings["input_ids"][0][0:5])

# The same five ids decoded back into text.
print(tokenizer.decode(train_encodings["input_ids"][0][0:5]))

[101, 4937, 11350, 2038, 2048]
[CLS] cat soup has two


In [8]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
  """Dataset that pairs tokenizer encodings with their labels.

  Args:
    encodings: mapping whose `.items()` yields (field name, per-example
      sequence) pairs; each sequence is indexed by example position.
    labels: sequence holding one label per example.
  """

  def __init__(self, encodings, labels):
    # Keep references to the inputs; tensors are only built on item access.
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    """Return the number of examples, taken from the label sequence."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return a dict of tensors for example `idx`.

    The dict has one entry per encoding field plus a 'labels' entry.
    """
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample
  
# Wrap the tokenized training texts and their labels for use with a DataLoader.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)

In [13]:
def test_inference(model, tokenizer):
    input_tokens = tokenizer(["I feel fantastic", "My life is going something wrong", "I have not figured out what the chosen title has to do with the movie"], truncation=True, padding=True)

    outputs = model(torch.tensor(input_tokens['input_ids']).to('mps'))
    label_dict = {1: 'positive', 0: 'negative'}

    return [label_dict[i] for i in torch.argmax(outputs['logits'], axis=1).cpu().numpy()]

In [16]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
from transformers import DistilBertTokenizerFast

# Pick an available accelerator instead of assuming Apple-silicon MPS, so the
# cell also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

# Predictions on the sample sentences before any fine-tuning.
print(test_inference(model, tokenizer))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# NOTE(review): transformers' own AdamW is deprecated upstream in favour of
# torch.optim.AdamW; consider switching once the import cell is updated.
optim = AdamW(model.parameters(), lr=5e-5)

model.train()

# Per-step training loss, stored as plain Python floats.
losses = []

for epoch in range(8):
    print(f'epoch:{epoch}')
    for batch in train_loader:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss as its first output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        # Store a float, not the tensor: keeping the tensor would hold on to
        # device memory and its autograd history for every training step.
        losses.append(loss.item())

        loss.backward()

        optim.step()
model.eval()

# Predictions on the same sample sentences after fine-tuning.
print(test_inference(model, tokenizer))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['positive', 'positive', 'positive']
epoch:0




epoch:1
epoch:2
epoch:3
epoch:4
epoch:5
epoch:6
epoch:7
['positive', 'negative', 'negative']


In [17]:
model.eval()

# Run on whatever device the model already lives on instead of assuming 'mps'.
eval_device = next(model.parameters()).device

# Predicted class index (0 or 1) for each test text, in order.
predictions = []

# Evaluation only: disable autograd so no graph is built for each of the
# forward passes below.
with torch.no_grad():
    for test_text in test_texts:

        input_tokens = tokenizer([test_text], truncation=True, padding=True)

        input_ids = torch.tensor(input_tokens['input_ids']).to(eval_device)
        attention_mask = torch.tensor(input_tokens['attention_mask']).to(eval_device)

        outputs = model(input_ids, attention_mask=attention_mask)

        predictions.append(torch.argmax(outputs['logits'], dim=1).item())

# Legacy name, kept in case a later cell still reads it.
l = predictions

# Accuracy = fraction of predictions that match the reference labels.
correct_cnt = sum(pred == ans for pred, ans in zip(predictions, test_labels))
print(correct_cnt/len(test_labels))

0.845
