In [1]:
# GPU 가능하다면 GPU 활용 ( 런타임-> 런타임 유형 변경 -> 가속기 -> GPU )

In [2]:
# Show the version reported by the runtime's `python` executable.
!python -V

Python 3.7.13


In [3]:
# Run this only if the Python version does not match.
# NOTE(review): this installs the system package; whether it changes the
# interpreter the notebook kernel actually runs on is not shown here — confirm.
!apt install python3.7

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3.7 is already the newest version (3.7.13-1+bionic3).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [4]:
!pip install -U torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 12.0 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.0
    Uninstalling torchtext-0.13.0:
      Successfully uninstalled torchtext-0.13.0
Successfully installed sentencepiece-0.1.96 torchtext-0.6.0


In [5]:
# Download the small English spaCy pipeline; the text field's tokenizer refers
# to it by the name 'en_core_web_sm'.
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 1.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import torch
from torchtext import data # this way of importing Field matches the torchtext 0.6 API

# Review text field, tokenized with the spaCy 'en_core_web_sm' pipeline.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language='en_core_web_sm')
# Sentiment label field, stored as float.
LABEL = data.LabelField(dtype=torch.float) # pos -> 1 : neg -> 0

In [7]:
# Load the IMDB dataset

from torchtext import datasets

# Build the train and test splits, with TEXT and LABEL as the fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 34.2MB/s]


In [8]:
# Number of examples in each split as loaded.
print(f'train_data : {len(train_data)}')
print(f'test_data : {len(test_data)}')

train_data : 25000
test_data : 25000


In [9]:
# Inspect the attributes of the first training example (its tokenized text is among them).
print(vars(train_data.examples[0]))

{'text': ['This', 'was', 'an', 'adorable', 'movie', '.', 'A', 'real', 'feel', '-', 'good', 'movie', 'when', 'you', 'need', 'one', '.', 'The', 'story', 'is', 'light', '(', 'this', 'is', 'no', 'Gone', 'With', 'the', 'Wind', ')', 'but', 'sometimes', ',', 'one', 'needs', 'this', 'kind', 'of', 'plot', '.', 'Funny', 'and', 'warm', 'characters', ',', 'fantastic', 'acting', 'and', 'beautiful', 'costumes', '/', 'wardrobe.<br', '/><br', '/>Parminder', 'K.', 'Nagra', '(', 'also', 'from', 'the', 'TV', 'show', 'ER', ')', 'is', 'WONDERFUL', 'in', 'this', 'role', '.', 'She', 'is', 'definitely', 'a', 'new', 'shining', 'star', 'for', 'Hollywood', '.', 'All', 'should', 'keep', 'an', 'eye', 'on', 'her', ',', 'she', "'s", 'going', 'to', 'be', 'BIG', 'in', 'the', 'future.<br', '/><br', '/>Also', 'impressing', 'was', 'the', 'soundtrack', 'for', 'this', 'movie', '.', 'A', 'nice', 'mix', 'of', 'modern', 'and', 'Indian', 'tunes', '.', 'I', 'was', 'dancing', 'throughout', 'most', 'of', 'the', 'movie.<br', '/><b

In [10]:
# Carve a validation set out of the training data, reproducibly.
import random

SEED = 1234

# Seed Python's RNG (it drives the shuffle used by the split) and PyTorch's RNG.
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# random.seed() returns None, so passing its return value as random_state only
# worked through the seeding side effect. Pass the seeded generator state
# explicitly, which is what `split` expects and yields the same split.
# NOTE: this rebinds train_data, so re-running the cell splits the already
# reduced training set again.
train_data, valid_data = train_data.split(random_state=random.getstate())

In [11]:
# Split sizes after the validation set has been carved out of the training data.
print(f'train_data : {len(train_data)}')
print(f'valid_data : {len(valid_data)}')
print(f'test_data : {len(test_data)}')

train_data : 17500
valid_data : 7500
test_data : 25000


In [12]:
# Build the vocabularies

MAX_VOCAB_SIZE = 25000

# Both vocabularies are built from the training split only.
# For TEXT, the size is capped at MAX_VOCAB_SIZE and a minimum frequency of 5 is required.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, min_freq=5)
LABEL.build_vocab(train_data)

In [13]:
# The TEXT vocab has 25002 entries because of two extra tokens: <unk> and <pad>.
# <pad> is the token used to bring sentences to the same length.
print(f'Unique tokens in TEXT vocab: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocab: {len(LABEL.vocab)}')

Unique tokens in TEXT vocab: 25002
Unique tokens in LABEL vocab: 2


In [14]:
# Show the ten most frequent tokens recorded in the TEXT vocab's frequency counter.
print(f'가장 자주 나오는 단어들 10개 in TEXT: \n{TEXT.vocab.freqs.most_common(10)}\n')

가장 자주 나오는 단어들 10개 in TEXT: 
[('the', 201623), (',', 191855), ('.', 165297), ('a', 109153), ('and', 108995), ('of', 100435), ('to', 93378), ('is', 75991), ('in', 61082), ('I', 54034)]



In [15]:
# Peek at both directions of the vocabulary mappings.
print(TEXT.vocab.itos[:3]) # itos: index -> token (int to string)
print(LABEL.vocab.stoi) # stoi: token -> index (string to int)

['<unk>', '<pad>', 'the']
defaultdict(None, {'neg': 0, 'pos': 1})


In [16]:
# Iterators (this cell only checks the installed PyTorch version)

print(torch.__version__)

1.12.0+cu113


In [17]:
# Use the GPU ('cuda') when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
BATCH_SIZE = 64

# One iterator per split, returned in the same order as the datasets passed in.
# Batches are created with BATCH_SIZE examples and placed on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device
)

In [19]:
# Using the iterator: walk through the training batches once, unpacking each one.
for i, batch in enumerate(train_iterator):
    text = batch.text
    label = batch.label

    # Uncomment to inspect the tensors of each batch:
    #print(f'batch text shape: {text.shape}')
    #print('text[3]', text[3])
    #print(f'batch label shape: {label.shape}')
    #print('label', label)


In [20]:
# 모델 build

# Embedding layer:   input -> 숫자로된 vector로 매핑하는데 look-up table
# 신경망에서 가중치가 업데이트 되는 방식처럼  역전파 때 학습

# RNN layer (함수 정의)
# RNN  문장 속에 단어들을 한번에 하나씩 계산하여 각 단어당 hidden state(h)를 도출
# h_t = RNN (x_t, h_(t-1))
# 마지막 hidden state   h_T 를 linear layer에 통과시키면   최종 prediction   y_hat = f(h_T)

In [21]:
import torch.nn as nn

In [22]:
class RNN(nn.Module):
    """Recurrent classifier: embedding lookup -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of values produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Submodules are created in this order and under these attribute
        # names; both determine the state_dict keys and the init RNG draws.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map token indices to one output row per sequence.

        Args:
            text: index tensor laid out as [sentence length, batch size].

        Returns:
            Tensor of shape [batch size, output dim].
        """
        # [sentence length, batch size, embedding dim]
        token_vectors = self.embedding(text)

        # The per-step outputs are not needed; only the final hidden state,
        # shaped [1, batch size, hidden dim], feeds the classifier.
        _, last_hidden = self.rnn(token_vectors)

        # Drop the leading singleton dimension before the linear layer.
        return self.fc(last_hidden.squeeze(0))

 -input_dim: 단어 집합(vocabulary)의 크기
 -embedding_dim: 임베딩 벡터의 차원 (보통 50-250)
 -hidden_dim: hidden state의 차원 (대개 100-500)
 -output_dim: 출력 차원. 이진 분류(0 vs 1)이므로 1

In [23]:
INPUT_DIM = len(TEXT.vocab) # vocabulary size (25002 in the recorded run)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# Build the classifier with the dimensions chosen above.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [24]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose `requires_grad` is False are left out of the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count, formatted with thousands separators.
print(f'Trainable Param num : {count_parameters(model):,}')

Trainable Param num : 2,592,105


In [25]:
# Training Model

In [26]:
import torch.optim as optim
# Stochastic gradient descent over all model parameters, learning rate 1e-3.
# NOTE(review): the run recorded in this notebook logs accuracy near 50% on
# every epoch; the optimizer / learning-rate choice may be worth revisiting —
# TODO confirm.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [27]:
# Loss function: binary cross-entropy that takes raw logits
# (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [28]:
# Move the model and the loss module to the selected device (the GPU when available).
model = model.to(device)
criterion = criterion.to(device)

In [29]:
# accuracy func
def binary_accuracy(preds, y):
    """Fraction of predictions that agree with the labels.

    Args:
        preds: raw model outputs (logits), one per example.
        y: target labels to compare against (the callers pass batch.label).

    Returns:
        A tensor holding (number of matches) / (number of entries along dim 0).
    """
    # Squash the logits to (0, 1), then round to a hard 0/1 prediction.
    hard_preds = torch.sigmoid(preds).round()

    # 1.0 where the prediction equals the label, 0.0 elsewhere.
    matches = (hard_preds == y).float()
    return matches.sum() / len(matches)

In [30]:
def train(model, iterator, optimizer, criterion):
    """Run one optimization pass over `iterator`.

    Args:
        model: module called on `batch.text`; its output is squeezed on dim 1.
        iterator: sized iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python numbers.
    """
    # Switch to training mode (affects layers such as dropout / batch norm).
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()

        # Forward pass; model output [batch size, 1] -> [batch size].
        logits = model(batch.text).squeeze(1)

        batch_acc = binary_accuracy(logits, batch.label)
        batch_loss = criterion(logits, batch.label)

        # Backpropagate, then let the optimizer update the parameters.
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [31]:
# evaluate

def evaluate(model, iterator, criterion):
    """Score the model over `iterator` without updating it.

    Args:
        model: module called on `batch.text`; its output is squeezed on dim 1.
        iterator: sized iterable of batches exposing `.text` and `.label`.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python numbers.
    """
    # Switch to evaluation mode.
    model.eval()

    loss_total, acc_total = 0, 0

    # Gradients are not needed while evaluating; disabling them saves memory
    # and computation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)

            batch_acc = binary_accuracy(logits, batch.label)
            batch_loss = criterion(logits, batch.label)

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


In [32]:
import time
def epoch_time(start_time, end_time):
    """Break the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        (whole minutes, remaining whole seconds), both truncated to int.
    """
    span = end_time - start_time
    whole_mins = int(span / 60)
    # Whatever is left after removing the whole minutes, truncated to seconds.
    leftover_secs = int(span - whole_mins * 60)
    return whole_mins, leftover_secs

In [33]:
# learning  through epochs

In [35]:
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf means the first finite
# validation loss is always saved.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()

  # One optimization pass over the training split, then a scoring pass over
  # the validation split.
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Per-epoch report: 1-based epoch number, wall time, loss and accuracy.
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

  # Checkpoint only when the validation loss improves, so the file on disk
  # always holds the weights of the best epoch so far.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'tut1-model.pt')

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.694 | Train Acc: 49.79%
	Valid Loss: 0.696 | Valid Acc: 49.61%
Epoch: 02 | Epoch Time: 0m 9s
	Train Loss: 0.693 | Train Acc: 50.18%
	Valid Loss: 0.696 | Valid Acc: 50.78%
Epoch: 03 | Epoch Time: 0m 9s
	Train Loss: 0.693 | Train Acc: 49.71%
	Valid Loss: 0.696 | Valid Acc: 49.61%
Epoch: 04 | Epoch Time: 0m 9s
	Train Loss: 0.693 | Train Acc: 50.16%
	Valid Loss: 0.696 | Valid Acc: 50.93%
Epoch: 05 | Epoch Time: 0m 9s
	Train Loss: 0.693 | Train Acc: 50.08%
	Valid Loss: 0.696 | Valid Acc: 49.70%


In [36]:
# Restore the weights written to 'tut1-model.pt' by the training loop.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU still loads when this session has no GPU.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

<All keys matched successfully>

In [37]:
# Score the held-out test split with the weights currently loaded in `model`.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'\ttest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


	test Loss: 0.709 | Test Acc: 47.33%
