In [1]:
import json
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

# nltk.download('punkt')
# nltk.download('stopwords')

# Load the intent definitions (tags, patterns, responses) from the JSON file.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Shared preprocessing tools: a Porter stemmer and NLTK's Indonesian stop-word list.
# NOTE(review): the Porter algorithm targets English while the stop words are
# Indonesian -- confirm this stemmer is really what is wanted for these patterns.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('indonesian'))

# Translation table that deletes every character in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

X = []  # cleaned pattern texts
y = []  # intent tag belonging to each entry of X

# Turn every pattern of every intent into one cleaned, labelled training sample.
for intent in data['intents']:
    for pattern in intent['patterns']:
        # Lowercase, strip punctuation, then split into tokens.
        words = nltk.word_tokenize(pattern.lower().translate(punctuation_table))

        # Skip stop words and stem whatever remains.
        stems = []
        for word in words:
            if word in stop_words:
                continue
            stems.append(stemmer.stem(word))

        X.append(" ".join(stems))
        y.append(intent['tag'])

# Encode the string tags as integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Preview at most the first 50 samples. The count is clamped to len(X) so the
# loop cannot raise IndexError when the dataset holds fewer than 50 patterns.
for i in range(min(50, len(X))):
    print(f"Teks: {X[i]} -> Label: {y[i]} (Encoded: {y_encoded[i]})")


Teks: hai -> Label: salam (Encoded: 5)
Teks: hi -> Label: salam (Encoded: 5)
Teks: halo -> Label: salam (Encoded: 5)
Teks: kabar -> Label: salam (Encoded: 5)
Teks: selamat pagi -> Label: salam (Encoded: 5)
Teks: selamat siang -> Label: salam (Encoded: 5)
Teks: selamat malam -> Label: salam (Encoded: 5)
Teks: salam -> Label: salam (Encoded: 5)
Teks: ping -> Label: salam (Encoded: 5)
Teks: p -> Label: salam (Encoded: 5)
Teks: dadah -> Label: bye (Encoded: 1)
Teks: selamat tinggal -> Label: bye (Encoded: 1)
Teks: dah -> Label: bye (Encoded: 1)
Teks: daah -> Label: bye (Encoded: 1)
Teks: semoga harimu menyenangkan -> Label: bye (Encoded: 1)
Teks: ok makasih -> Label: bye (Encoded: 1)
Teks: jumpa -> Label: bye (Encoded: 1)
Teks: ok bye -> Label: bye (Encoded: 1)
Teks: sakit kepala -> Label: sakit_kepala (Encoded: 4)
Teks: kepala sakit -> Label: sakit_kepala (Encoded: 4)
Teks: puse -> Label: sakit_kepala (Encoded: 4)
Teks: sakit kepala banget -> Label: sakit_kepala (Encoded: 4)
Teks: puse ->

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# 1. Turn the cleaned texts into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# 2. Train a multinomial Naive Bayes classifier on the encoded labels.
#    fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_vectorized, y_encoded)

# 3. Preprocessing of user input
def preprocess_input(text):
    """Clean a raw user message into a space-separated string of stems.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is tokenized with ``nltk.word_tokenize``, tokens found
    in the module-level ``stop_words`` set are dropped, and the remaining
    tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw message typed by the user.

    Returns:
        The processed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    no_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(no_punctuation)

    stems = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return " ".join(stems)

# 4. Chatbot response function
def chatbot_response(text, fallback="Maaf, aku belum mengerti maksudmu."):
    """Return a reply for a user message based on the predicted intent.

    The message is preprocessed, vectorized with the fitted ``vectorizer``,
    classified with ``model``, and the predicted class id is mapped back to
    its tag through ``le``. A random entry from the matching intent's
    ``responses`` list is returned.

    Args:
        text: The raw message typed by the user.
        fallback: Reply used when the message contains no term known to the
            vectorizer, or when the predicted intent has no usable responses.

    Returns:
        A response string; never ``None``.
    """
    processed = preprocess_input(text)
    vect_text = vectorizer.transform([processed])

    # A row without any stored value means none of the words were seen during
    # training. The classifier would then be driven by the class priors alone
    # and answer with an unrelated intent, so reply with the fallback instead.
    if vect_text.nnz == 0:
        return fallback

    prediction = model.predict(vect_text)[0]
    tag = le.inverse_transform([prediction])[0]

    # Pick a response from the first intent with this tag that actually has
    # responses; an empty or missing list must not crash random.choice.
    for intent in data['intents']:
        if intent['tag'] == tag and intent.get('responses'):
            return random.choice(intent['responses'])

    # No intent matched: be explicit rather than returning None implicitly.
    return fallback

# 5. Interactive chat loop
import random

print("KBOT: Hai! Ketik 'keluar' untuk berhenti.")
while True:
    try:
        user_input = input("Kamu: ")
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: leave the loop politely
        # instead of ending the cell with a traceback.
        print("KBOT: Sampai jumpa lagi yaa!")
        break

    # Normalise before comparing so that e.g. " Keluar " still ends the chat.
    command = user_input.strip().lower()
    if command in ("keluar", "exit", "quit"):
        print("KBOT: Sampai jumpa lagi yaa!")
        break
    if not command:
        # Nothing was typed; ask again rather than classifying an empty message.
        continue

    response = chatbot_response(user_input)
    print("KBOT:", response)


KBOT: Hai! Ketik 'keluar' untuk berhenti.


KBOT: Hai
KBOT: Sampai jumpa lagi yaa!
