<a href="https://colab.research.google.com/github/zhaocaiQ/machine_learning_study/blob/master/%ED%8C%8C%EC%9D%B4%ED%86%A0%EC%B9%98__LSTM_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.legacy import data,datasets
import random

In [44]:
SRC = data.Field(tokenize = 'spacy', lower = True)
TRG = data.LabelField(dtype = torch.int64)

train_data, test_data = datasets.IMDB.splits(SRC, TRG) # download imdb dataset

In [45]:
# display single example at index 0
print(vars(train_data.examples[0]))

{'text': ['i', 'always', 'enjoyed', 'watching', 'this', 'when', 'it', 'came', 'on', 'television', 'during', 'prime', '-', 'time', 'every', 'year', 'in', 'the', '60', "'s", '.', 'it', "'s", 'a', 'typical', 'hollywood', 'history', 'epic', ',', 'dramatized', ',', 'stylized', 'and', 'full', 'of', 'inaccuracies', 'but', 'so', 'what', ',', 'it', "'s", 'an', 'entertaining', 'movie', 'and', 'a', 'good', 'looking', 'film', '.', 'cecil', 'b.', 'demille', 'at', 'the', 'end', 'of', 'his', 'life', 'is', 'the', 'executive', 'producer', 'of', 'this', 'remake', 'of', 'his', '1938', 'film', '.', 'his', 'son', '-', 'in', '-', 'law', 'actor', 'anthony', 'quinn', 'who', 'had', 'the', 'supporting', 'role', 'of', 'beluche', 'in', 'the', "'", '38', 'film', 'is', 'the', 'director', 'in', 'his', 'directorial', 'debut', 'and', 'swan', 'song', 'as', 'he', 'had', 'never', 'directed', 'a', 'film', 'before', 'and', 'never', 'would', 'again', '.', 'demille', 'assembled', 'a', 'crew', 'who', 'had', 'recently', 'worke

In [46]:
# Build vocabulary for source and target from training data

SRC.build_vocab(train_data, max_size=10000, min_freq=5, vectors="glove.6B.100d")  # using pretrained word embedding
TRG.build_vocab(train_data, min_freq = 5)

print(vars(TRG.vocab))
print(f"Unique tokens in source vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in TRG vocabulary: {len(TRG.vocab)}")

{'freqs': Counter({'pos': 12500, 'neg': 12500}), 'itos': ['neg', 'pos'], 'unk_index': None, 'stoi': defaultdict(None, {'neg': 0, 'pos': 1}), 'vectors': None}
Unique tokens in source vocabulary: 10002
Unique tokens in TRG vocabulary: 2


In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 100

# train and test iteartor
train_iterator,test_iterator = data.BucketIterator.splits(
      (train_data, test_data), 
      batch_size = BATCH_SIZE, 
      device = device
    )

In [49]:
# Model class
class Model(nn.Module):
  def __init__(self, input_dim, output_dim,emb_dim, hidden_dim, n_layers, dropout):
    # input_dim <--- vocabulary size
    # output_dim <--- len ([positive, negative]) == 2 
    # emb_dim <--- embedding dimension of embedding matrix
    
    super(Model, self).__init__()
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)
    
    self.fc1 = nn.Linear(hidden_dim, hidden_dim//2)
    self.fc2 = nn.Linear(hidden_dim//2, output_dim)
    
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    # shape: [source_len, batch_size]
    embedded = self.dropout(self.embedding(src)) # sahpe: [src_len, batch_size, embed_dim]
    output, (hidden, cell) = self.rnn(embedded) 
    # output shape -> [batch, hidden_dim]
    # hiddden shape -> [n_layers, batch, hidden_dim]
    # cell shape -> [n_layers, batch, hidden_dim]
    output = self.fc1(output[-1])
    output = self.fc2(self.relu(output))
    return output

In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#initializing variables and hyper parameters
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# initializing our model
model = Model(INPUT_DIM, OUTPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)

# loading pretrained word embedding
model.embedding.weight.data.copy_(TEXT.vocab.vectors) 

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1915, -0.2686,  0.0245,  ..., -0.4086, -0.5865,  0.0474],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [51]:
optimizer = optim.Adam(model.parameters(), lr=3e-3)

# defining learnig rate scheduler (optional)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

criterion = nn.CrossEntropyLoss()


# Model training function
def train(model, iterator, optimizer=optimizer, criterion=criterion, clip=1):
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_count = 0
    
    for i, batch in enumerate(iterator):
        src = batch.text.to(device)
        trg = batch.label.to(device)
        optimizer.zero_grad()
        output = model(src)
        
        total_correct += torch.sum(torch.eq(output.argmax(1), trg))
        total_count+=len(trg)
        
        loss = criterion(output, trg)
        
        loss.backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        
    print(f'correct: {total_correct}/{total_count}')
    mean_loss = epoch_loss / len(iterator)
    scheduler.step(mean_loss)
    return mean_loss # mean loss

In [None]:
# loop and train our model
total_epoch = 120
for epoch in range(total_epoch):
  result = train(model=model, iterator=train_iterator)
  print(f'Epoch {epoch} -->', result)

correct: 12413/25000
Epoch 0 --> 0.7105521597862243
correct: 12536/25000
Epoch 1 --> 0.6947320268154145
correct: 12634/25000
Epoch 2 --> 0.694028380393982
correct: 12366/25000
Epoch 3 --> 0.6939971520900726


In [1]:
# function to experiment movie review sentences
import spacy

!python -m spacy download en # dwonload english from spacy

sp = spacy.load('en')


def predict(sentence):

  if type(sentence) == str:
    tokanized_sentence = [word.text for word in sp.tokenizer(sentence)]
  else:
    tokanized_sentence = sentence


  input_data = [SRC.vocab.stoi[word.lower()] for word in tokanized_sentence]
  input_data = torch.tensor(input_data, dtype=torch.int64).unsqueeze(1).to(device)


  model.eval()
  output = model(input_data)
  # print(output)
  predict = output.argmax(1)
  predict = predict.squeeze(0)
  print(output)

  if predict>0:
    return "---->> Positive Review"
  else:
    return '---->> Negative Review'

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
predict('i have enjoyed this movie') # predict funciton will predict if this is positive or negative review.

# 검증세트까지

In [33]:
optimizer = optim.Adam(model.parameters(), lr=3e-3)

# defining learnig rate scheduler (optional)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

criterion = nn.CrossEntropyLoss()


# Model training function
def train(model, iterator, optimizer=optimizer, criterion=criterion, clip=1):
    model.train()
    for b, batch in enumerate(iterator):
      x, y = batch.text.to(device), batch.label.to(device)
      y.data.sub_(1)  # 레이블 값을 0과 1로 변환
      optimizer.zero_grad()

      logit = model(x)
      loss = criterion(output, y)
      loss.backward()
      optimizer.step()


def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.text.to(device), batch.label.to(device)
        y.data.sub_(1) # 레이블 값을 0과 1로 변환
        logit = model(x)
        loss = criterion(output, y, reduction='sum')
        total_loss += loss.item()
        corrects += (output.argmax(1).view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [36]:
EPOCHS = 10
best_val_loss = None
for e in range(1, EPOCHS+1):
  train(model=model, iterator=train_iter)
  val_loss, val_accuracy = evaluate(model, val_iter)

  print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (e, val_loss, val_accuracy))

  # 검증 오차가 가장 적은 최적의 모델을 저장
  if not best_val_loss or val_loss < best_val_loss:
      if not os.path.isdir("snapshot"):
          os.makedirs("snapshot")
      torch.save(model.state_dict(), './snapshot/txtclassification.pt')
      best_val_loss = val_loss

NameError: ignored

In [None]:
# function to experiment movie review sentences
import spacy

!python -m spacy download en # dwonload english from spacy

sp = spacy.load('en')


def predict(sentence):

  if type(sentence) == str:
    tokanized_sentence = [word.text for word in sp.tokenizer(sentence)]
  else:
    tokanized_sentence = sentence


  input_data = [SRC.vocab.stoi[word.lower()] for word in tokanized_sentence]
  input_data = torch.tensor(input_data, dtype=torch.int64).unsqueeze(1).to(device)


  model.eval()
  output = model(input_data)
  # print(output)
  predict = output.argmax(1)
  predict = predict.squeeze(0)
  print(output)

  if predict>0:
    return "---->> Positive Review"
  else:
    return '---->> Negative Review'

In [None]:
predict('i have enjoyed this movie') # predict funciton will predict if this is positive or negative review.