In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import random
import re
import pickle
import pandas as pd

In [2]:
def clean_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'([^\w\s])', r' \1 ', text) # replace punctuation with space
    text = re.sub(r'\s+', ' ', text) # replace multiple spaces with single space
    text = text.strip()

    return text

In [3]:
def indexsFromSentence(vocab, sentence):
    return [vocab.get(word, vocab['<UNK>']) for word in sentence.split(' ')]

def tensorFromSentence(vocab, sentence):
    indexes = indexsFromSentence(vocab, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device = device).view(-1, 1)

In [4]:
hidden_size = 256
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK_token = 3
MAX_LENGTH = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seq2Seq Model

In [5]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=2)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden
    
    def initHidden(self):
        return (torch.zeros(2, 1, self.hidden_size, device=device), 
                torch.zeros(2, 1, self.hidden_size, device=device))
    

class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output[0])
        return output, hidden
    
    def initHidden(self):
        return (torch.zeros(2, 1, self.hidden_size, device=device), 
                torch.zeros(2, 1, self.hidden_size, device=device))

In [6]:
def train(input_tensor, target_tensor, encocer, decoder, encoder_optimizer, decoder_optimizer, criterion):
    encoder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    loss = 0
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        
    decoder_input = torch.tensor([[SOS_token]], device=device)
    
    decoder_hidden = encoder_hidden
    
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        
        loss += criterion(decoder_output, target_tensor[di])
        
        if decoder_input.item() == EOS_token:
            break
        
    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item() / target_length

In [7]:
def trainIters(encoder, decoder, n_iters, print_every = 1000, learning_rate = 0.01):
    print_loss_total = 0

    for iter in range(1, n_iters+1):
        training_pair = random.choice(pairs)
        input_tensor = tensorFromSentence(word_to_ix, training_pair[0]).to(device)
        target_tensor = tensorFromSentence(word_to_ix, training_pair[1]).to(device)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print(f'Iteration: {iter}, Loss: {print_loss_avg: .4f}')
            print_loss_total = 0

In [8]:
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(word_to_ix, sentence).to(device)
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.initHidden()
        encoder_hidden = tuple([e.to(device) for e in encoder_hidden])

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden
        decoder_words = [] # output sentence

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoder_words.append('<EOS>')
                break
            else:
                decoder_words.append(ix_to_word[topi.item()])

            decoder_input = topi.squeeze().detach()
        
        return ' '.join(decoder_words)

In [27]:
# 챗봇
def chat(encoder, decoder, max_length = MAX_LENGTH):
    print("Let's Chat! (type 'bye' to exit)")
    while True:
        input_sentence = input('You: ')
        print(f'You: {input_sentence}')
        if input_sentence == 'bye':
            break

        output_sentence = evaluate(encoder, decoder, input_sentence)
        print(f'Bot: {output_sentence}')

### Run

In [10]:
df = pd.read_csv('data/chatbot_dataset.txt', sep = '\t', names = ['Question', 'Answer'])
df['Encoder Inputs'] = df['Question'].apply(clean_text)
df['Decoder Inputs'] = df['Answer'].apply(clean_text)

In [11]:
df['Encoder Inputs']

0                               hi , how are you doing ?
1                      i ' m fine . how about yourself ?
2                i ' m pretty good . thanks for asking .
3                    no problem . so how have you been ?
4                   i ' ve been great . what about you ?
                             ...                        
295             how long have you known how to do that ?
296        i first learned how to do it in high school .
297    did you take some sort of art class or somethi...
298                         that was my favorite class .
299                        you have got to be talented .
Name: Encoder Inputs, Length: 300, dtype: object

In [12]:
input_sentence = [sentence for sentence in df['Encoder Inputs']]
output_sentence = [sentence + "<EOS>" for sentence in df['Decoder Inputs']]

In [13]:
output_sentence[:5]

["i ' m fine . how about yourself ?<EOS>",
 "i ' m pretty good . thanks for asking .<EOS>",
 'no problem . so how have you been ?<EOS>',
 "i ' ve been great . what about you ?<EOS>",
 "i ' ve been good . i ' m in school right now .<EOS>"]

In [14]:
# 단어 사전 생성
all_words = set(' '.join(df['Encoder Inputs'].tolist()+df['Decoder Inputs'].tolist()).split())

In [15]:
all_words

{"'",
 ',',
 '.',
 '?',
 'a',
 'about',
 'absolutely',
 'activities',
 'actually',
 'after',
 'again',
 'ago',
 'ahead',
 'air',
 'alice',
 'all',
 'already',
 'always',
 'am',
 'an',
 'and',
 'anita',
 'answered',
 'any',
 'anything',
 'apologize',
 'appreciate',
 'are',
 'around',
 'art',
 'asking',
 'at',
 'attend',
 'attending',
 'available',
 'back',
 'bad',
 'badly',
 'be',
 'beach',
 'beautiful',
 'because',
 'been',
 'before',
 'believe',
 'better',
 'big',
 'bismol',
 'boss',
 'bothering',
 'bought',
 'brand',
 'brown',
 'bumped',
 'busy',
 'but',
 'buy',
 'california',
 'call',
 'called',
 'calling',
 'campus',
 'can',
 'change',
 'changing',
 'chores',
 'chuck',
 'chucks',
 'class',
 'classes',
 'clean',
 'cleaner',
 'cleaning',
 'clear',
 'clearly',
 'clears',
 'closer',
 'cold',
 'come',
 'congratulations',
 'considering',
 'constantly',
 'cool',
 'cost',
 'could',
 'couple',
 'cute',
 'd',
 'day',
 'days',
 'deal',
 'degree',
 'degrees',
 'describe',
 'deserved',
 'did',


In [16]:
vocab = {'<PAD>': PAD_token, '<SOS>': SOS_token, '<EOS>': EOS_token, '<UNK>': UNK_token}
vocab.update({word: idx+4 for idx, word in enumerate(all_words)})
vocab_size = len(vocab)

# 단어 사전 저장
with open('data/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [17]:
vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '<UNK>': 3,
 'not': 4,
 'noticed': 5,
 'people': 6,
 'clearly': 7,
 'rains': 8,
 'cool': 9,
 'girls': 10,
 'a': 11,
 'took': 12,
 'rain': 13,
 'nice': 14,
 'had': 15,
 'easier': 16,
 'hello': 17,
 'think': 18,
 'from': 19,
 'lovely': 20,
 'sky': 21,
 'prettiest': 22,
 'tell': 23,
 'painting': 24,
 'sooner': 25,
 'because': 26,
 'yet': 27,
 'while': 28,
 'fine': 29,
 'our': 30,
 'about': 31,
 'look': 32,
 'find': 33,
 'bad': 34,
 'better': 35,
 'plan': 36,
 'only': 37,
 'back': 38,
 'me': 39,
 'will': 40,
 'attend': 41,
 'so': 42,
 'fun': 43,
 'beautiful': 44,
 'considering': 45,
 'degree': 46,
 'sort': 47,
 'these': 48,
 'winter': 49,
 'campus': 50,
 'excited': 51,
 'are': 52,
 'until': 53,
 'how': 54,
 'clean': 55,
 'change': 56,
 'wait': 57,
 'hang': 58,
 'am': 59,
 'apologize': 60,
 'exactly': 61,
 'says': 62,
 'bismol': 63,
 'somewhere': 64,
 'feet': 65,
 'week': 66,
 'down': 67,
 'did': 68,
 'when': 69,
 'what': 70,
 'ninety': 71,
 'which': 

In [18]:
word_to_ix = vocab
ix_to_word = {i: word for word, i in word_to_ix.items()}

In [19]:
word_to_ix['hello']

17

In [20]:
encoder = EncoderLSTM(vocab_size, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, vocab_size).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.005)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()

In [21]:
# 학습 데이터 생성
pairs = [list(x) for x in zip(df['Encoder Inputs'], df['Decoder Inputs'])]

In [22]:
pairs[1]

["i ' m fine . how about yourself ?",
 "i ' m pretty good . thanks for asking ."]

In [23]:
# 학습 실험
trainIters(encoder, decoder, 15000, print_every=1000)

Iteration: 1000, Loss:  3.0642
Iteration: 2000, Loss:  2.8006
Iteration: 3000, Loss:  2.5807
Iteration: 4000, Loss:  2.3017
Iteration: 5000, Loss:  1.9613
Iteration: 6000, Loss:  1.6375
Iteration: 7000, Loss:  1.3488
Iteration: 8000, Loss:  1.2294
Iteration: 9000, Loss:  1.0958
Iteration: 10000, Loss:  0.9555
Iteration: 11000, Loss:  0.8563
Iteration: 12000, Loss:  0.7075
Iteration: 13000, Loss:  0.6445
Iteration: 14000, Loss:  0.6656
Iteration: 15000, Loss:  0.6687


In [24]:
torch.save(encoder.state_dict(), 'data/encoder_tmp.pth')
torch.save(decoder.state_dict(), 'data/decoder_tmp.pth')

In [25]:
# 챗봇 실행
encoder.eval()
decoder.eval()

DecoderLSTM(
  (embedding): Embedding(433, 256)
  (lstm): LSTM(256, 256, num_layers=2)
  (out): Linear(in_features=256, out_features=433, bias=True)
)

In [28]:
chat(encoder, decoder)

Let's Chat! (type 'bye' to exit)
You: hello
Bot: oh . how much to tall . <EOS>
You: how are you?
Bot: i really great . <EOS>
You: how old are you?
Bot: how really great . how . how lot . <EOS>
You: what did you eat today?
Bot: yes , offered me . <EOS>
You: bye
