In [48]:
import re
import string
import numpy as np
from csv import DictReader
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support

#Import stuffs

In [23]:
def preprocessing(dataset):
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    for row in dataset:
        row['message'] = row.get('message').casefold()
        row['message'] = re.sub(r"[0-9]", "", row.get('message'))
        row['message'] = re.sub('['+string.punctuation+']', "", row.get('message'))
        row['message_stopwords'] = stopwords.remove(row['message'])
        row['message_stemmed'] = stemmer.stem(row['message_stopwords'])
        row['message_tokenized'] = word_tokenize(row['message_stemmed'])
        
#Function untuk preprocess data

In [24]:
dataset = []
with open('D:\Kuliah\PBA\FP\dataset.csv', 'r') as file:
    reader = DictReader(file, delimiter=';')
    for row in reader:
        if row['voting'] == '':
            dataset[len(dataset)-1]['response'] = row['message']
        else:
            dataset.append(
                    {
                        'message': row['message'],
                        'category' : row['voting']
                    }
                )

preprocessing(dataset)
shuffle(dataset)
#Preprocess data

Data ada 505 line pertanyaan, jadi dibagi 80% data training yaitu 404 dan 20% data testing yaitu 101

In [25]:
datatrain = dataset[101:]
datatest  = dataset[:101]

In [26]:
for x in dataset:
    print(x)

{'message': 'okedeh thanks', 'category': 'konfirmasi', 'response': 'sama2 gan', 'message_stopwords': 'okedeh thanks', 'message_stemmed': 'okedeh thanks', 'message_tokenized': ['okedeh', 'thanks']}
{'message': 'normal kan', 'category': 'kondisi barang', 'response': 'iya gan, sesuai deskripsi', 'message_stopwords': 'normal kan', 'message_stemmed': 'normal kan', 'message_tokenized': ['normal', 'kan']}
{'message': 'itu iphon  plus apa yang  biasa gan', 'category': 'stok barang', 'response': 'Ini ip 7 biasa gan', 'message_stopwords': 'iphon  plus apa  biasa gan', 'message_stemmed': 'iphon plus apa biasa gan', 'message_tokenized': ['iphon', 'plus', 'apa', 'biasa', 'gan']}
{'message': 'oke gansaya ambil tolong pilihkan stabil rom nya dan aman ya gan dan yg pasti produknya yg new dan bagus ya', 'category': 'konfirmasi', 'response': 'ok siap', 'message_stopwords': 'oke gansaya ambil pilihkan stabil rom nya aman gan yg produknya yg new bagus', 'message_stemmed': 'oke gansaya ambil pilih stabil r

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
datatrain, datatest = train_test_split(dataset, test_size=0.2)

In [57]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform([row['message'] for row in datatrain])
X_train_counts.shape

(400, 508)

In [30]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(400, 508)

In [67]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', svm.SVC(C=1, gamma=1.0)),
])

_ = text_clf_svm.fit([row['message'] for row in datatrain], [row['category'] for row in datatrain])

In [68]:
predicted_svm = text_clf_svm.predict([row['message'] for row in datatest])
np.mean(predicted_svm == [row['category'] for row in datatest])

0.7029702970297029

In [69]:
y_true = [row['category'] for row in datatest]
y_pred = predicted_svm
precision_recall_fscore_support(y_true, y_pred, average='macro')

(0.7781051253273475, 0.5815728599545025, 0.6085693935693937, None)