In [2]:
# import library
import string
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from util import JSONParser

In [3]:
# load data
path = "data/intents.json"

# buat objek JSONParser dan parse data intents.json
jp = JSONParser()
jp.parse(path)

# simpan dataframe dalam variabel df
df = jp.get_dataframe()

[INFO] Data JSON converted to DataFrame with shape : (57, 2)


In [4]:
# lihat 5 data pertama
df.head()

Unnamed: 0,text_input,intents
0,Hai,salam
1,Hi,salam
2,Halo,salam
3,Apa Kabar,salam
4,Selamat Pagi,salam


In [5]:
# hitung jumlah data per tag / inten
df.intents.value_counts()

intents
salam               10
bye                  8
komunitas            7
nama                 6
omnichannel_typo     6
pekerjaan            5
omnichannel          5
youtube              5
kemampuan            5
Name: count, dtype: int64

In [6]:
def preprocess(chat):
    """
    Fungsi yang digunakan untuk melakukan praproses
    """
    # konversi ke lowercase
    chat = chat.lower()
    # menghapus tanda baca
    tandabaca = tuple(string.punctuation)
    chat = "".join(ch for ch in chat if ch not in tandabaca)
    return chat

In [8]:
# implementasikan fungsi preprocess ke string
df["text_input_prep"] = df.text_input.apply(preprocess)

In [9]:
df[["text_input", "text_input_prep"]].head(10)

Unnamed: 0,text_input,text_input_prep
0,Hai,hai
1,Hi,hi
2,Halo,halo
3,Apa Kabar,apa kabar
4,Selamat Pagi,selamat pagi
5,Selamat Siang,selamat siang
6,Selamat Malam,selamat malam
7,Salam,salam
8,Ping,ping
9,P,p


In [13]:
# inisiasi objek CountVectorizer
vect = CountVectorizer()

In [14]:
# mengumpulkan vocab dari data teks yang sudah dilakukan praproses
vect.fit(df["text_input_prep"])

In [18]:
# lihat list vocab
vect.get_feature_names_out()[:10]  # batasi hanya 10 vocab teratas

array(['ada', 'aja', 'anda', 'apa', 'apaan', 'bantu', 'bantuin', 'bisa',
       'bye', 'channel'], dtype=object)

In [19]:
# ubah data teks menjadi matriks
text_vect = vect.transform(df.text_input_prep)

text_vect

<57x75 sparse matrix of type '<class 'numpy.int64'>'
	with 160 stored elements in Compressed Sparse Row format>

In [21]:
pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,ada,aja,anda,apa,apaan,bantu,bantuin,bisa,bye,channel,...,sih,tanyatanya,tinggal,tugas,tuh,urlnya,wah,ya,yang,youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# deklarasi objek MultinomialNB
nb = MultinomialNB()

# training data, dengan X : text_vect dan y : intents
nb.fit(text_vect, df.intents)

In [23]:
# input string dari user
chat = input("Masukkan String : ")

# lakukan preproses
chat = preprocess(chat)

# ubah teks menjadi vektor
chat = vect.transform([chat])

# prediksi vektor teks kedalam model machine learning
res = nb.predict(chat)

# tampilkan hasil prediksi
print(f"Hasil prediksi : {res[0]}")

Hasil prediksi : salam


In [24]:
# input string dari user
chat = input("Masukkan String : ")

# lakukan preproses
chat = preprocess(chat)

# ubah teks menjadi vektor
chat = vect.transform([chat])

# prediksi vektor teks kedalam model machine learning
res = nb.predict_proba(chat)

# ambil nilai probabilitas tertinggi
max_prob = max(res[0])
max_idx = np.argmax(res[0])
print(f"Max Prob : {max_prob}\nMax Index: {max_idx}\nLabel: {nb.classes_[max_idx]}")

Max Prob : 0.41434924936751305
Max Index: 3
Label: nama


In [25]:
# Deklarasi pipeline yang mengandung vektorisasi (CountVectorizer) & pemodelan (MultinomialNB)
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

# Training
pipe.fit(df.text_input, df.intents)

In [26]:
# input string dari user
chat = input("Masukkan String : ")

# lakukan preproses
chat = preprocess(chat)

# prediksi teks kedalam pipeline
res = pipe.predict_proba([chat])

# ambil nilai probabilitas tertinggi
max_prob = max(res[0])
max_idx = np.argmax(res[0])
print(f"Max Prob : {max_prob}\nMax Index: {max_idx}\nLabel: {nb.classes_[max_idx]}")

Max Prob : 0.5350913763086662
Max Index: 1
Label: kemampuan


In [None]:
print("Anda Terhubung dengan chatbot Kami")
while True:
    # input user
    chat = input("Anda : ")
    # praproses
    chat = preprocess(chat)
    # prediksi intent
    res = pipe.predict_proba([chat])
    # ambil nilai probabilitas & lokasinya
    max_prob = max(res[0])
    max_idx = np.argmax(res[0])
    # kondisi jika probabilitas kurang dari threshold
    if max_prob < 0.20:
        print("Bot : Maaf Kak, aku ga ngerti")
    else:
        print(f"Bot : {jp.get_response(nb.classes_[max_idx])}")
    if nb.classes_[max_idx] == "bye":
        break

In [30]:
with open("data/model/chatbot_pipeline.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)