In [14]:
!pip install malaya
!pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [22]:
# Tokenizer

import numpy as np
import malaya
from malaya.tokenizer import Tokenizer

tokenizer = Tokenizer()
sastrawi = malaya.stem.sastrawi()

def tokenize(sentence):
    """
    Split sentence into array of tokens.
    A token can be a word or punctuation character, or number.
    """

    return tokenizer.tokenize(sentence)


def stem(word):
    """
    Stemming is a process to find the root form of the word.

    Example:
    words = ["menyeru", "menyerukanlah"]
    words = [stem(w) for w in words]
    -> ["seru"]
    """

    return sastrawi.stem(word.lower())


def bag_of_words(tokenized_sentence, words):
    """
    Return bag of words array:
    1 for each known word that exists in the sentence, 0 otherwise.

    Example:
    sentence = ["apa", "khabar"]
    words = ["hello", "apa", "khabar"]
    bow = [0, 1, 1]
    """

    # Stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # Initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1

    return bag

In [7]:
# Model

import torch.nn as nn

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        # No activation and softmax at the end

        return out

In [31]:
# Training

import json
import torch
import torch.nn
from torch.utils.data import Dataset, DataLoader

with open("intents.json", "r") as f:
  intents = json.load(f)

  all_words = []
  tags = []
  xy = []

  for intent in intents["intents"]:
    tag = intent["tag"]
    tags.append(tag)

    for pattern in intent["patterns"]:
      word = tokenize(pattern)
      all_words.extend(word)
      xy.append((word, tag))

  ignore_words = ["?", ".", "!"]
  all_words = [stem(w) for w in all_words if w not in ignore_words]
  tags = sorted(set(tags))

  print(f"patterns: {len(xy)}", xy)
  print(f"tags: {len(tags)}", tags)
  print(f"unique stemmed words: {len(all_words)}", all_words)

  # Create training data
  x_train = []
  y_train = []

  for (pattern, tag) in xy:
    bag = bag_of_words(pattern, all_words)
    x_train.append(bag)

    label = tags.index(tag)
    y_train.append(label)

  x_train = np.array(x_train)
  y_train = np.array(y_train)

  # Hyperparameters
  # See: https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html#hyperparameters
  num_epochs = 1000
  batch_size = 8
  learning_rate = 0.001
  input_size = len(x_train[0])
  hidden_size = 8
  output_size = len(tags)

  print(f"input size: {input_size}, output size: {output_size}")

  class ChatDataset(Dataset):
    def __init__(self):
      self.n_samples = len(x_train)
      self.x_data = x_train
      self.y_data = y_train

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n_samples

  dataset = ChatDataset()
  train_loader = DataLoader(
      dataset=dataset,
      batch_size=batch_size,
      shuffle=True,
      num_workers=0
  )

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = NeuralNet(input_size, hidden_size, output_size).to(device)

  # Loss and optimizer
  loss_fn = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  for epoch in range(num_epochs):
    for (words, labels) in train_loader:
      words = words.to(device)
      labels = labels.to(dtype=torch.long).to(device)
      outputs = model(words)
      loss = loss_fn(outputs, labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    if (epoch+1) % 100 == 0:
      print (f"epoch {epoch+1}/{num_epochs}, loss: {loss.item():.8f}")

  print(f"final loss: {loss.item():.8f}")

  torch.save({
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "all_words": all_words,
    "tags": tags
  }, "data.pth")


patterns: 26 [(['hi'], 'greeting'), (['hai'], 'greeting'), (['hello'], 'greeting'), (['assalamualaikum'], 'greeting2'), (['salam'], 'greeting2'), (['slm'], 'greeting2'), (['mat', 'pagi'], 'greeting_time_morning'), (['selamat', 'pagi'], 'greeting_time_morning'), (['pagi'], 'greeting_time_morning'), (['mat', 'malam'], 'greeting_time_night'), (['selamat', 'malam'], 'greeting_time_night'), (['malam'], 'greeting_time_night'), (['siapa', 'awak', '?'], 'name'), (['siapa', 'kau', '?'], 'name'), (['kau', 'ni', 'siapa', '?'], 'name'), (['nama', 'apa', '?'], 'name'), (['apa', 'nama', 'kau', '?'], 'name'), (['single', 'tak', '?'], 'relationship_status'), (['taken', 'ke', '?'], 'relationship_status'), (['single', 'or', 'taken'], 'relationship_status'), (['taken', '?'], 'relationship_status'), (['single', '?'], 'relationship_status'), (['apa', 'status', 'relay', 'kau', '?'], 'relationship_status'), (['ada', 'calon', 'tak', '?'], 'relationship'), (['ada', 'gf', '?'], 'relationship'), (['ada', 'girlfr

In [32]:
# Chat

import random
import json
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open("intents.json", "r") as json_data:
  intents = json.load(json_data)
  data = torch.load("data.pth")

  input_size = data["input_size"]
  hidden_size = data["hidden_size"]
  output_size = data["output_size"]
  all_words = data["all_words"]
  tags = data["tags"]
  model_state = data["model_state"]

  model = NeuralNet(input_size, hidden_size, output_size).to(device)
  model.load_state_dict(model_state)
  model.eval()

  while True:
      sentence = input("You: ")
      if sentence == "quit":
          break

      sentence = tokenize(sentence)
      x = bag_of_words(sentence, all_words)
      x = x.reshape(1, x.shape[0])
      x = torch.from_numpy(x).to(device)

      output = model(x)
      _, predicted = torch.max(output, dim=1)

      tag = tags[predicted.item()]
      probs = torch.softmax(output, dim=1)
      prob = probs[0][predicted.item()]
      if prob.item() > 0.75:
          for intent in intents["intents"]:
              if tag == intent["tag"]:
                  print(f"Bot: {random.choice(intent['responses'])}")
      else:
          print("Bot: tak paham")

You: hai awak
Bot: hello awaks
You: single or taken
Bot: masih single
You: mat pagi
Bot: morning wak
You: ada gf tak?
Bot: tak sebab tak hesmes :(
You: apakah status relay anda?
Bot: single abadi


KeyboardInterrupt: Interrupted by user