In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 데이터 준비

In [2]:
# Reserved vocabulary indices; Lang.index2word pre-registers them as 'SOS' and 'EOS'.
SOS_token = 0 # start of sentence
EOS_token = 1 # end of sentence
# process_data keeps a sentence pair only when both sentences split into
# fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 20

class Lang:
  """Vocabulary of one language: word <-> index maps plus word frequencies.

  Indices 0 and 1 are reserved for the 'SOS' and 'EOS' markers, so the first
  real word receives index 2.

  Attributes:
    word2index: dict mapping each word to its integer index.
    word2count: dict mapping each word to how many times it has been added.
    index2word: dict mapping each index (including 0/1) back to its word.
    n_words: number of entries in index2word, i.e. the next free index.
  """

  def __init__(self):
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0:'SOS', 1:'EOS'}
    self.n_words = 2 # SOS, EOS

  def addSentence(self, sentence):
    """Register every token of `sentence`, splitting on single spaces.

    Splitting on ' ' (not arbitrary whitespace) means consecutive spaces
    produce empty-string tokens, which are registered like any other word.
    """
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self, word):
    """Add `word` to the vocabulary, or bump its count if already known."""
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1
    else:
      # Only the frequency changes for a known word; its index must stay
      # stable or it would no longer agree with index2word.
      self.word2count[word] += 1

## 데이터 정규화

In [11]:
# Load the raw tab-separated corpus (three unnamed columns), keep only the
# two sentence columns, and save them as a CSV next to the source file.
data = pd.read_table(
  '/content/drive/MyDrive/pytorch/kor.txt',
  header=None,
  names=['kor', 'eng', 'ex'],
).loc[:, ['kor', 'eng']]
data.to_csv('/content/drive/MyDrive/pytorch/kor.csv')

In [12]:
def normalizeString(df, lang):
  """Return the `lang` column of `df` lower-cased and reduced to ASCII letters.

  Steps, applied to every entry of the column:
    1. lower-case the text;
    2. replace each run of characters that are neither ASCII letters nor
       whitespace with a single space;
    3. apply Unicode NFD normalization;
    4. drop any remaining non-ASCII characters.

  Args:
    df: pandas DataFrame holding the sentences.
    lang: name of the column to normalize.

  Returns:
    A pandas Series with the normalized sentences.

  NOTE(review): step 2 keeps only A-Z/a-z and whitespace, so text written in
  a non-Latin script (e.g. Hangul) is reduced to whitespace. Confirm this is
  intended for every column this function is applied to.
  """
  sentence = df[lang].str.lower()
  # regex=True is required: recent pandas treats the pattern as a literal
  # string by default, which would leave punctuation untouched.
  sentence = sentence.str.replace(r'[^A-Za-z\s]+', ' ', regex=True)
  sentence = sentence.str.normalize('NFD')
  # The result must be assigned back to `sentence`, otherwise the ASCII
  # filtering is silently discarded.
  sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8')
  return sentence

def read_sentence(df, lang1, lang2):
  """Normalize the `lang1` and `lang2` columns of `df`.

  Returns:
    A 2-tuple (normalized lang1 column, normalized lang2 column), each
    produced by normalizeString.
  """
  return tuple(normalizeString(df, column) for column in (lang1, lang2))

def read_file(loc, lang1, lang2):
  """Read a headerless tab-separated sentence-pair file into a DataFrame.

  Args:
    loc: path or file-like object accepted by pandas.read_csv.
    lang1: column name given to the first field of every line.
    lang2: column name given to the second field of every line.

  Returns:
    A DataFrame with exactly two columns, [lang1, lang2], and a default
    integer index.
  """
  # Restrict parsing to the first two fields. Without usecols, a file with
  # extra trailing fields (e.g. an attribution column) makes pandas promote
  # the leading column to the index and shift the data under the wrong names.
  df = pd.read_csv(
    loc,
    delimiter='\t',
    header=None,
    names=[lang1, lang2],
    usecols=[0, 1],
  )
  return df

def process_data(lang1, lang2, loc='/content/drive/MyDrive/pytorch/kor.txt'):
  """Load the corpus and build both vocabularies plus the filtered pairs.

  Args:
    lang1: column name / label of the input-side language.
    lang2: column name / label of the output-side language.
    loc: location of the tab-separated corpus. Defaults to the absolute
      path of kor.txt that the notebook loads elsewhere.

  Returns:
    (input_lang, output_lang, pairs) where the two Lang objects hold the
    vocabularies of the kept sentences and `pairs` is a list of
    [lang1_sentence, lang2_sentence] lists.
  """
  # read_file requires the column names as well as the location.
  df = read_file(loc, lang1, lang2)
  sentence1, sentence2 = read_sentence(df, lang1, lang2)

  input_lang = Lang()
  output_lang = Lang()
  pairs = []
  # Iterate by position so the result does not depend on the DataFrame index.
  for source, target in zip(sentence1, sentence2):
    # Missing cells come through as NaN (not str); they cannot be tokenized.
    if not (isinstance(source, str) and isinstance(target, str)):
      continue
    # Keep only pairs where both sides have fewer than MAX_LENGTH tokens.
    if len(source.split(' ')) < MAX_LENGTH and len(target.split(' ')) < MAX_LENGTH:
      input_lang.addSentence(source)
      output_lang.addSentence(target)
      pairs.append([source, target])

  return input_lang, output_lang, pairs

## 텐서로 변환

In [13]:
def indexesFromSentence(lang, sentence):
  """Map each single-space-separated token of `sentence` to its vocabulary index.

  Looks tokens up in `lang.word2index`; a token missing from it raises
  KeyError.
  """
  indexes = []
  for token in sentence.split(' '):
    indexes.append(lang.word2index[token])
  return indexes

def tensorFromSentence(lang, sentence):
  """Encode `sentence` as a column tensor of word indices ending in EOS.

  Returns:
    A torch.long tensor on `device`, reshaped with view(-1, 1) so that each
    row holds one index; the final row is EOS_token.
  """
  token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
  flat = torch.tensor(token_ids, dtype=torch.long, device=device)
  return flat.view(-1, 1)

def tensorFromPair(input_lang, output_lang, pair):
  """Convert one sentence pair into an (input_tensor, output_tensor) tuple.

  `pair[0]` is encoded with `input_lang` and `pair[1]` with `output_lang`,
  both through tensorFromSentence.
  """
  source_sentence, target_sentence = pair[0], pair[1]
  return (
    tensorFromSentence(input_lang, source_sentence),
    tensorFromSentence(output_lang, target_sentence),
  )

## 인코더 네트워크

In [15]:
class Encoder(nn.Module):
  """GRU encoder: embeds a single source token and runs it through a GRU.

  Args:
    input_dim: size of the source vocabulary (number of embedding rows).
    hidden_dim: size of the GRU hidden state.
    embbed_dim: size of the token embedding vectors.
    num_layers: number of stacked GRU layers.
  """

  def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers,):
    super(Encoder, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.embbed_dim = embbed_dim
    self.num_layers = num_layers
    # The attribute name must match the one read in forward(); it was
    # previously registered under a misspelled name, so forward() raised
    # AttributeError on the first call.
    self.embedding = nn.Embedding(input_dim, self.embbed_dim)
    self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)

  def forward(self, src):
    """Encode one token.

    Args:
      src: tensor holding a single token index (the embedding is reshaped
        with view(1, 1, -1), i.e. one time step for one sequence).

    Returns:
      (outputs, hidden) exactly as returned by the GRU, which is started
      from its default initial hidden state because none is passed in.
    """
    embedded = self.embedding(src).view(1,1,-1)
    outputs, hidden = self.gru(embedded)
    return outputs, hidden

## 디코더 네트워크

In [19]:
class Decoder(nn.Module):
  """GRU decoder producing log-probabilities over the target vocabulary.

  Args:
    output_dim: size of the target vocabulary (embedding rows and width of
      the output projection).
    hidden_dim: size of the GRU hidden state.
    embbed_dim: size of the token embedding vectors.
    num_layers: number of stacked GRU layers.
  """

  def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
    super().__init__()

    # Keep the hyperparameters available on the instance.
    self.embbed_dim, self.hidden_dim = embbed_dim, hidden_dim
    self.output_dim, self.num_layers = output_dim, num_layers

    # Layers are created in a fixed order: embedding, GRU, projection.
    self.embedding = nn.Embedding(output_dim, embbed_dim)
    self.gru = nn.GRU(embbed_dim, hidden_dim, num_layers=num_layers)
    self.out = nn.Linear(hidden_dim, output_dim)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    """Run one decoding step.

    Args:
      input: tensor of token indices; it is reshaped with view(1, -1), so
        it is treated as a single time step.
      hidden: GRU hidden state to continue from.

    Returns:
      (prediction, hidden): log-softmax scores computed from the first
      (only) time step of the GRU output, and the updated hidden state.
    """
    step_tokens = input.view(1, -1)
    activated = F.relu(self.embedding(step_tokens))
    gru_output, next_hidden = self.gru(activated, hidden)
    scores = self.out(gru_output[0])
    return self.softmax(scores), next_hidden