In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import random
import json

In [85]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

okt = Okt()

In [86]:
data = pd.read_json("./intents.json")

In [87]:
data.loc[0][0]

{'tag': 'greeting',
 'patterns': ['안녕?', '반가워.', '반가웡', '헤이', '왓썹', '방가방가', '안뇽'],
 'responses': ['안녕ㅋㅋ', '하이!', '헬로우', 'ㅋㅋㅋ', '머해 머해', '나도 반갑다 펭귄'],
 'context_set': ''}

In [88]:
words = []
labels = []
docs_x = []
docs_y = []

In [89]:
for row in data.intents:
    tag = row.get("tag")
    for pattern in row.get("patterns"):
        wrds = okt.morphs(pattern)
        t_wrds = [w for w in wrds if w not in words]
        words.extend(t_wrds)
        
        docs_x.append(wrds)
        docs_y.append(tag)

    if tag not in labels:
        labels.append(tag)

In [90]:
words

['안녕',
 '?',
 '반가워',
 '.',
 '반가웡',
 '헤이',
 '왓썹',
 '방가',
 '방가',
 '안뇽',
 '넌',
 '누구',
 '야',
 '니',
 '너',
 '가',
 '인지',
 '궁금해',
 '이름',
 '이',
 '뭐',
 '어디서',
 '왔어',
 '왔니',
 '너무',
 '귀여워',
 '!',
 '매력',
 '만점',
 '이야',
 '이제',
 '그만',
 'ㅎㅎ',
 '즐거',
 '웟',
 '어',
 '잘가']

In [91]:
max_length = len(words)

max_length

37

In [92]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
result = le.fit_transform(docs_y)

result

array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 0, 0,
       0], dtype=int64)

In [93]:
class CustomDataset(Dataset):
    def __init__(self, patterns, labels):
        self.patterns = patterns
        self.labels = labels

    def __len__(self):
        return len(self.patterns)

    def __getitem__(self,idx):
        return torch.tensor(self.patterns[idx]), torch.tensor(self.labels[idx])

In [94]:
one_hot_encoding = torch.zeros(max_length)

one_hot_encoding[1] = 1

one_hot_encoding

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])

In [95]:
random.shuffle(words)

print(words)

['어디서', '!', '넌', '그만', '너', '안녕', '이제', '너무', '방가', '반가웡', '웟', '이야', '왔니', '왔어', 'ㅎㅎ', '야', '인지', '가', '반가워', '귀여워', '뭐', '이름', '즐거', '?', '매력', '어', '안뇽', '왓썹', '헤이', '니', '.', '누구', '방가', '잘가', '만점', '이', '궁금해']


In [96]:
training_x = []

for x in docs_x:
    one_hot_encoding = torch.zeros(max_length)

    for word in x:
        if word in words:
            t_index = words.index(word)
            one_hot_encoding[t_index] = 1

    training_x.append(one_hot_encoding.tolist())

In [104]:
training_y = []

for label in docs_y:
    one_hot_encoding = np.zeros(len(labels))
    
    t_index = labels.index(label)

    one_hot_encoding[t_index] = 1

    training_y.append(t_index)

training_y

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4]

In [105]:
training_dataset = CustomDataset(training_x, training_y)

In [106]:
training_dataloader = DataLoader(training_dataset, batch_size=4, shuffle=True)

In [107]:
len(training_dataloader)

6

In [108]:
class Net(nn.Module):
    def __init__(self, input_size, label_size):
        super(Net, self).__init__()

        self.layer = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, label_size)
        )

    def forward(self, x):
        return self.layer(x)

In [109]:
model = Net(max_length, len(labels))

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [110]:
nb_epochs = 100

for epoch in range(nb_epochs):
    avg_loss = 0

    for i, (pattern, label) in enumerate(training_dataloader):
        pred = model(pattern)

        loss = criterion(pred, label)
        avg_loss += loss 
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss /= len(training_dataloader)
    print(f"Epoch: [{epoch}/{nb_epochs}] / Loss: {avg_loss}")

Epoch: [0/100] / Loss: 1.6009521484375
Epoch: [1/100] / Loss: 1.3709951639175415
Epoch: [2/100] / Loss: 0.9497727751731873
Epoch: [3/100] / Loss: 0.5204460024833679
Epoch: [4/100] / Loss: 0.2673269510269165
Epoch: [5/100] / Loss: 0.12054628878831863
Epoch: [6/100] / Loss: 0.07913297414779663
Epoch: [7/100] / Loss: 0.022363364696502686
Epoch: [8/100] / Loss: 0.007029101252555847
Epoch: [9/100] / Loss: 0.0023170432541519403
Epoch: [10/100] / Loss: 0.0009882923914119601
Epoch: [11/100] / Loss: 0.0006389086483977735
Epoch: [12/100] / Loss: 0.00042128676432184875
Epoch: [13/100] / Loss: 0.0003173970908392221
Epoch: [14/100] / Loss: 0.0002564969763625413
Epoch: [15/100] / Loss: 0.00019871963013429195
Epoch: [16/100] / Loss: 0.00016030836559366435
Epoch: [17/100] / Loss: 0.0001238281693076715
Epoch: [18/100] / Loss: 0.00010843225754797459
Epoch: [19/100] / Loss: 9.222234803019091e-05
Epoch: [20/100] / Loss: 8.613857789896429e-05
Epoch: [21/100] / Loss: 7.609821477672085e-05
Epoch: [22/100] / 

In [117]:
def inference(word):
    word = okt.morphs(word)

    one_hot_encoding = torch.zeros(max_length)

    if word in words:
        t_index = words.index(word)
        one_hot_encoding[t_index] = 1

    encoded_label = torch.argmax(model(one_hot_encoding))

    tag = labels[encoded_label]

    responses = list(filter(lambda x: x["tag"] == tag, data.intents.tolist()))
    response = random.choice(responses[0]["responses"])
    return response

In [120]:
print(inference("아 엔터키 ㅡ게 만드는거ㅋㅋㅋㅋㅋㅋㅋ"))

헬로우
