1.1 fully connected layer 학습

RNN과 LSTM 모델을 학습하기에 앞서 ANN(Fully connected layer)으로 모델을 구성

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable

# Build the model skeleton (its layers) in __init__

class ANN(nn.Module):
  """Fully connected network: input -> hidden -> hidden -> output.

  Args:
    num_output: Number of units produced by the final linear layer.
    input_size: Number of features in each input vector.
    hidden_size: Width of both hidden layers.
    device: Stored as ``self.device``; this class does not move tensors
      itself.
  """

  def __init__(self, num_output, input_size, hidden_size, device):
    super(ANN, self).__init__()
    self.device = device

    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.outlayer = nn.Linear(hidden_size, num_output)

  def forward(self, x):
    """Run the forward pass.

    Args:
      x: Tensor whose last dimension has ``input_size`` features.

    Returns:
      Raw outputs of the last linear layer (no activation applied), with
      ``num_output`` values in the last dimension.
    """
    h = self.fc1(x).relu()
    # The second layer must consume the first hidden activation, not the raw
    # input: feeding `x` here skips fc1 entirely and breaks as soon as
    # input_size differs from hidden_size.
    h = self.fc2(h).relu()
    predict = self.outlayer(h)
    return predict
    

1.2 LSTM for NLP

LSTM을 torch로 구현. word2vec을 사용해도 되지만, 여기서는 정수 인코딩 결과를 임베딩 벡터로 변환해 주는 nn.Embedding 레이어를 사용

In [8]:

class LSTM_net(nn.Module):
  """Embedding -> bidirectional LSTM -> two linear layers, for sequence classification.

  Args:
    num_output: Number of values produced per sequence by the output layer.
    hidden_size: Hidden size of the LSTM (per direction).
    size_vocab: Number of rows in the embedding table; every integer id fed
      to ``forward`` must be smaller than this.
    dim_embed: Dimension of each embedding vector.
    linear_size: Width of the intermediate fully connected layer.
    num_layers: Number of stacked LSTM layers.
    device: Device on which the initial LSTM states are created.
  """

  def __init__(self, num_output, hidden_size, size_vocab, dim_embed, linear_size, num_layers, device):
    super(LSTM_net, self).__init__()
    self.device = device
    self.num_output = num_output
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # Embedding layer: (number of distinct ids, embedding dimension).
    self.embed = nn.Embedding(size_vocab, dim_embed)

    # nn.LSTM applies dropout only between stacked layers and warns when it
    # is non-zero with a single layer, so enable it only when it has an effect.
    self.lstm = nn.LSTM(input_size=dim_embed, hidden_size=hidden_size,
                        num_layers=num_layers,
                        dropout=0.3 if num_layers > 1 else 0.0,
                        bidirectional=True)
    # Fully connected layers used for classification.
    self.fclayer = nn.Linear(hidden_size, linear_size)
    self.outlayer = nn.Linear(linear_size, num_output)

  def forward(self, x):
    """Classify a batch of integer-encoded sequences.

    Args:
      x: Integer tensor of token ids, [batch_size, seq_len].

    Returns:
      Tensor [batch_size, num_output] of raw output-layer values.
    """
    # A bidirectional LSTM keeps one state per layer *and* per direction.
    num_directions = 2 if self.lstm.bidirectional else 1
    # emb: [batch_size, seq_len, dim_embed]
    emb = self.embed(x)
    state_shape = (self.num_layers * num_directions, emb.size(0), self.hidden_size)
    # Zero initial states; plain tensors replace the deprecated Variable wrapper.
    h_state = torch.zeros(state_shape, dtype=emb.dtype, device=self.device)
    c_state = torch.zeros(state_shape, dtype=emb.dtype, device=self.device)
    # The LSTM is built without batch_first, so it expects seq_len first.
    lstm_out, (h, c) = self.lstm(emb.transpose(1, 0), (h_state, c_state))
    # Keep only the last entry of the final hidden states: [batch_size, hidden_size].
    # NOTE(review): with bidirectional=True this is the reverse direction of
    # the top layer only; the forward direction's final state is not used.
    h = h[-1]
    h = self.fclayer(h).relu()
    predict = self.outlayer(h)
    return predict

1.3 데이터 전처리

토큰화 -> 정제, 추출 -> 정수 인코딩 


In [None]:
import os
import pandas as pd
# Load the e-mail data set from 'emails.csv' in the working directory.
data = pd.read_csv('emails.csv')
# Show the frame summary and its first rows. `display` is not imported in the
# visible cells (presumably the IPython/notebook builtin). DataFrame.info()
# prints its report itself and returns None, so the first displayed value is None.
display(data.info(), data.head())

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
# Tokenise every e-mail and drop English stop words.
stop_words = set(stopwords.words('english'))
data = data.dropna().reset_index(drop=True)
token_text = []
# Iterate over the rows that actually remain after dropna(); a hard-coded row
# count raises IndexError as soon as a row is dropped or the file changes.
for i in range(len(data)):
  token = word_tokenize(data.iloc[i, 0])  # column 0 is tokenised as the text
  # The NLTK stop-word list is lower-case, so compare case-insensitively;
  # otherwise capitalised stop words such as "The" are kept.
  token_stop_text = [w for w in token if w.lower() not in stop_words]
  token_text.append(token_stop_text)
print('after cleaning: ', len(token_text))

정수 인코딩

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit a Keras Tokenizer on the cleaned token lists so it builds its word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_text)
# Number of distinct words recorded in the tokenizer's word index.
print(len(tokenizer.word_index))

In [None]:
# Integer-encode every token list with the fitted tokenizer.
# The Keras Tokenizer method is `texts_to_sequences`; `text_to_sequence` does
# not exist and raises AttributeError.
text_encoded = tokenizer.texts_to_sequences(token_text)
print(text_encoded[0])

In [None]:
# Sequences have different lengths, so bring them all to one fixed length:
# longer ones are truncated, shorter ones are right-padded with 0.

# text_encoded is a ragged list of lists; np.shape() would first have to turn
# it into an array, which recent NumPy versions refuse for ragged input, so
# report the number of sequences directly.
print((len(text_encoded),))
# NOTE(review): text_label is not defined in any visible cell - confirm where
# the label array is created.
print(np.shape(text_label))
longest = max((len(w) for w in text_encoded), default=0)
print(longest)  # inspect the longest sequence before choosing the cut-off

maxlen = 100
# Pre-allocate an int64 matrix of zeros (0 is the padding id) and copy each
# truncated sequence into the start of its row. This keeps an integer dtype
# even for empty sequences and avoids the concatenate/reshape round trip.
text_padded = np.zeros((len(text_encoded), maxlen), dtype=np.int64)
for row, w in enumerate(text_encoded):
  kept = w[:maxlen]
  text_padded[row, :len(kept)] = kept
print(np.shape(text_padded))

1.4 학습을 위한 dataset 만들기 및 학습 과정

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import LongTensor as LT
from torch import FloatTensor as FT

class Generate_Dataset(torch.utils.data.Dataset):
  """Map-style dataset over paired, index-aligned inputs and labels.

  Args:
    xdata: Indexable collection of integer-encoded inputs.
    ydata: Indexable collection of integer labels, aligned with ``xdata``.
    device: Device on which the returned tensors are created.
  """

  def __init__(self, xdata, ydata, device):
    self.x_data = xdata
    self.y_data = ydata
    self.device = device

  def __len__(self):
    """Return the number of samples (the length of ``x_data``)."""
    return len(self.x_data)

  def __getitem__(self, idx):
    """Return sample ``idx`` as a pair of int64 tensors on ``self.device``."""
    # torch.tensor with an explicit dtype accepts any integer input dtype and
    # scalar labels; the LongTensor constructor is stricter about array dtypes
    # and treats a bare Python int as a size rather than a value.
    x = torch.tensor(self.x_data[idx], dtype=torch.long, device=self.device)
    y = torch.tensor(self.y_data[idx], dtype=torch.long, device=self.device)
    return x, y


In [None]:
# Wrap the first 5000 padded sequences and their labels (reshaped to a single
# column) in a dataset, then split it at random into 4500 training and 500
# test samples.
dataset = Generate_Dataset(text_padded[:5000], text_label[:5000].reshape([-1,1]), device)
trainset, testset = random_split(dataset, [4500, 500])
# Training batches are shuffled; the test loader's batch size equals the size
# of the test split, so it yields the whole split as one batch.
train_loader = DataLoader(trainset, batch_size=256, shuffle= True)
test_loader = DataLoader(testset, batch_size = 500, shuffle = False)

In [None]:
# Define the network and its optimizer.

# Keras Tokenizer word ids start at 1 and 0 is used here as the padding id, so
# the embedding table needs len(word_index) + 1 rows; without the +1 the
# highest word id is out of range. The model is also moved to `device`,
# because the dataset already places every batch there.
lstm_net = LSTM_net(num_output=2, size_vocab=len(tokenizer.word_index) + 1, dim_embed=64,
                    hidden_size=64, linear_size=64, num_layers=1, device=device).to(device)
optimizer = torch.optim.Adam(lstm_net.parameters(), lr=0.01)

training session

In [None]:
from tqdm import tqdm
# Train for ten epochs; each optimizer step uses one mini-batch of train_loader.
for epoch in range(10):
  print('epoch', epoch)
  with tqdm(train_loader, unit='batch') as tepoch:
    for x, y in tepoch:
      # Clear gradients left over from the previous step before the new pass.
      optimizer.zero_grad()
      logits = lstm_net(x)
      targets = y.ravel()  # flatten labels from [batch, 1] to [batch]
      loss = torch.nn.functional.cross_entropy(logits, targets)
      loss.backward()
      optimizer.step()
    # Loss of the last mini-batch processed in this epoch.
    print(loss)


test performance

In [None]:
# Measure classification accuracy on the held-out split.
lstm_net.eval()  # switch dropout layers to inference behaviour
score = 0
total = 0
# Gradients are not needed for evaluation.
with torch.no_grad():
  with tqdm(test_loader, unit='batch') as tepoch:
    for x, y in tepoch:
      # .cpu() is required before .numpy() when the batch lives on the GPU.
      predict = lstm_net(x).argmax(1).cpu().numpy()
      answer = y.ravel().cpu().numpy()
      # Accumulate over every batch instead of scoring only the last one.
      score += int((predict == answer).sum())
      total += len(answer)
# Use the number of samples actually seen rather than a hard-coded 500.
accuracy = score / total * 100 if total else 0.0
print(f'{score} out of {total}, accuracy is {accuracy} %')

2. seq2seq 모델

LSTM 활용 기계번역 구현

In [3]:
!pip install torchtext==0.10.0.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0.
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 15.0 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.7 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.

In [4]:
import os
import spacy
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field
# Download the small English and German spaCy pipelines through the spaCy CLI.
# The exit codes returned by os.system are not checked.
os.system('python -m spacy download en_core_web_sm')
os.system('python -m spacy download de_core_news_sm')

# Load both pipelines; the tokenizer helpers below use their `.tokenizer`.
spacy_german = spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')


In [5]:
def tokenize_de(text):
  """Split German text into token strings and return them in reversed order."""
  tokens = [piece.text for piece in spacy_german.tokenizer(text)]
  tokens.reverse()
  return tokens
def tokenize_en(text):
  """Split English text into a list of token strings, in their original order."""
  doc = spacy_english.tokenizer(text)
  return [piece.text for piece in doc]
# Source (German) and target (English) fields: each uses its own tokenizer,
# '<sos>' / '<eos>' as start and end tokens, and lower-casing enabled.
SRC = Field(tokenize = tokenize_de, init_token = '<sos>', eos_token= '<eos>', lower= True )
TRG = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token= '<eos>', lower= True )


In [6]:
# Load the Multi30k train/validation/test splits, pairing the '.de' files with
# SRC and the '.en' files with TRG. The recorded output below shows that this
# call starts a download.
train_data , val_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

downloading training.tar.gz


ConnectionError: ignored

In [None]:
# Build both vocabularies from the training split only, with min_freq = 2
# (presumably tokens seen fewer than twice are left out - confirm against the
# torchtext documentation).
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [None]:
# network structure

import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

class seq_Encoder(nn.Module):
  """Encoder half of the seq2seq model: embedding -> dropout -> LSTM.

  Args:
    vocab_size: Number of rows in the embedding table.
    dim_embed: Dimension of each embedding vector.
    hidden_size: Hidden size of the LSTM.
    num_layers: Number of stacked LSTM layers.
    dropout: Probability used both for the dropout on the embeddings and for
      the LSTM's own ``dropout`` argument.
  """

  def __init__(self, vocab_size, dim_embed, hidden_size, num_layers, dropout):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers
    self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim_embed)
    self.lstm = nn.LSTM(input_size=dim_embed, hidden_size=hidden_size,
                        num_layers=num_layers, dropout=dropout)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, src):
    """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` states.

    Args:
      src: Integer tensor of token ids; presumably [seq_len, batch], since
        the LSTM is created without ``batch_first`` - confirm with the caller.

    Returns:
      Tuple ``(hidden, cell)``; the per-step LSTM outputs are discarded.
    """
    embedded = self.embed(src)
    regularised = self.dropout(embedded)
    _, final_state = self.lstm(regularised)
    hidden, cell = final_state
    return hidden, cell

In [None]:
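# The decoder takes the context vector (the encoder's hidden/cell states), runs it through an LSTM layer, and applies a fully connected layer on top.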

class seq_Decoder(nn.Module):
  """Decoder half of the seq2seq model; produces one step of output per call.

  The base class must be ``nn.Module``: ``torch.nn`` has no attribute
  ``module``, so the misspelling fails when the class is defined.

  Args:
    output_size: Size of the target vocabulary; used both as the number of
      embedding rows and as the width of the output layer.
    dim_embed: Dimension of each embedding vector.
    hidden_size: Hidden size of the LSTM.
    num_layers: Number of stacked LSTM layers.
    dropout: Probability used both for the dropout on the embeddings and for
      the LSTM's own ``dropout`` argument.
  """

  def __init__(self, output_size, dim_embed, hidden_size, num_layers, dropout):
    super().__init__()

    self.output_size = output_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embed = nn.Embedding(output_size, dim_embed)
    self.lstm = nn.LSTM(dim_embed, hidden_size, num_layers, dropout=dropout)
    self.fclayer = nn.Linear(hidden_size, output_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input_data, hidden, cell):
    """Decode a single time step.

    Args:
      input_data: Integer tensor [batch] holding the current token id of each
        sequence.
      hidden: LSTM hidden state carried over from the previous step.
      cell: LSTM cell state carried over from the previous step.

    Returns:
      Tuple ``(prediction, hidden, cell)`` where ``prediction`` is
      [batch, output_size] and the states are the updated LSTM states.
    """
    # Add a leading sequence axis of length 1: [batch] -> [1, batch].
    input_data = input_data.unsqueeze(0)
    embedded = self.dropout(self.embed(input_data))
    output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
    # Drop the length-1 sequence axis again before the output layer.
    prediction = self.fclayer(output.squeeze(0))

    return prediction, hidden, cell

In [None]:
import random

class seq2seq(nn.Module):
  """Encoder-decoder wrapper that decodes step by step with teacher forcing.

  The base class must be ``nn.Module``: ``torch.nn`` has no attribute
  ``module``, so the misspelling fails when the class is defined.

  Args:
    encoder: Callable mapping ``source`` to ``(hidden, cell)``.
    decoder: Callable ``(tokens, hidden, cell) -> (prediction, hidden, cell)``
      that also exposes an ``output_size`` attribute.
    device: Device on which the output buffer is allocated.
  """

  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, source, target, tf_ratio=0.5):
    """Produce decoder outputs for every target position after the first.

    Args:
      source: Source batch, passed unchanged to the encoder.
      target: Integer tensor [seq_len, batch] of target token ids; row 0 is
        used as the first decoder input (the start token).
      tf_ratio: Probability, per step, of feeding the ground-truth token to
        the next step instead of the decoder's own argmax prediction.

    Returns:
      Tensor [seq_len, batch, decoder.output_size]. Index 0 is never written
      and stays all zeros.
    """
    batch_size = target.shape[1]
    translation_length = target.shape[0]
    target_vocab_size = self.decoder.output_size

    # Pre-allocate the buffer that collects the prediction of every step.
    outputs = torch.zeros(translation_length, batch_size, target_vocab_size,
                          device=self.device)
    hidden, cell = self.encoder(source)
    input_trans = target[0, :]  # first row of the target: the start tokens

    for t in range(1, translation_length):
      output, hidden, cell = self.decoder(input_trans, hidden, cell)
      outputs[t] = output

      # Teacher forcing: with probability tf_ratio feed the true token,
      # otherwise feed the model's own best guess.
      teacher_force = random.random() < tf_ratio
      input_trans = target[t] if teacher_force else output.argmax(1)
    return outputs


In [None]:
# Pick the GPU when one is available (the function is `is_available`), then
# build the encoder/decoder pair, the wrapping seq2seq model and its optimizer.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Positional arguments: vocabulary size, embedding dim, hidden size, layers, dropout.
enc = seq_Encoder(len(SRC.vocab), 64, 64, 1, 0.3)
dec = seq_Decoder(len(TRG.vocab), 64, 64, 1, 0.3)
seq_net = seq2seq(enc, dec, device).to(device)
optimizer = torch.optim.Adam(seq_net.parameters(), lr=0.01)


In [None]:
from torchtext.legacy.data import BucketIterator
# Build batch iterators for the three splits. The class is spelled
# `BucketIterator`, and `device` must be passed by keyword: a positional
# argument after `batch_size=...` is a SyntaxError.
train_iterator, val_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data), batch_size=256, device=device)


In [None]:
# Train the seq2seq network.

pad_index = TRG.vocab.stoi[TRG.pad_token]
# The loss class is nn.CrossEntropyLoss; ignore_index keeps positions whose
# target is the padding token out of the loss.
lossfn = nn.CrossEntropyLoss(ignore_index=pad_index)

# seq_net output : [seq_len, batch, vocab]
# batch.trg      : [seq_len, batch]
# The loss is computed on [N, vocab] scores against [N] class ids, so the
# sequence and batch axes are merged. Position 0 is skipped on both sides
# because seq2seq.forward never writes outputs[0].
for epoch in range(10):
  loss_epoch = 0
  for batch in train_iterator:
    source_data = batch.src
    target_data = batch.trg
    target_pred = seq_net(source_data, target_data)
    # reshape (unlike view) also works when a slice is not contiguous.
    target_pred = target_pred[1:].reshape(-1, target_pred.shape[-1])
    target_data = target_data[1:].reshape(-1)
    optimizer.zero_grad()
    loss = lossfn(target_pred, target_data)
    loss.backward()
    optimizer.step()
    loss_epoch += loss.item()

  # Mean loss per batch over the epoch.
  print('epoch', epoch, 'loss', loss_epoch/len(train_iterator))
    