In [66]:
#import library
import pandas as pd
import numpy as np
import shap
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

from Cleansing import clean

In [67]:
#read data training
train = pd.read_csv("dataset/train_preprocess.tsv.txt", sep='\t', names=['Kalimat','Sentiment'])
train.head()

Unnamed: 0,Kalimat,Sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [68]:
#cek data volume
train.shape

(11000, 2)

In [69]:
train.isna().sum()

Kalimat      0
Sentiment    0
dtype: int64

In [70]:
train.Sentiment.value_counts()

positive    6416
negative    3436
neutral     1148
Name: Sentiment, dtype: int64

In [71]:
train.duplicated().sum()

67

In [72]:
train.drop_duplicates()

Unnamed: 0,Kalimat,Sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10993,f - demokrat dorong upaya kemandirian energi n...,neutral
10994,tidak bosan,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


In [73]:
train.Sentiment.value_counts()

positive    6416
negative    3436
neutral     1148
Name: Sentiment, dtype: int64

In [74]:
train['Clean'] = train.Kalimat.apply(clean)
train.head()

Unnamed: 0,Kalimat,Sentiment,Clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung dimiliki pengusaha pabrik puluhan terke...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus k212 mmbri hujjah partai diw...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatra bandung nya nya...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya unboxing paket barang nya b...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,aduh mahasiswa sombong kasih kartu kuning bela...


In [133]:
dataprep = train.Clean.to_list()
dataprep

['warung dimiliki pengusaha pabrik puluhan terkenal putih bandung berkualitas dipadu keahlian memasak dipadu kretivitas warung menyajikan menu utama berbahan ditambah menu ayam selera indonesia harga terjangkau melewati bletoka nya kalah asli tegal',
 'mohon ulama lurus k212 mmbri hujjah partai diwlh suara islam pecah pecah',
 'lokasi strategis jalan sumatra bandung nya nyaman sofa lantai 2 paella nya enak pas dimakan minum bir dingin appetiser nya enak enak',
 'betapa bahagia nya unboxing paket barang nya bagus menetapkan beli',
 'aduh mahasiswa sombong kasih kartu kuning belajar usahlah politik selesai kuliah nya politik telat dasar mahasiswa',
 'makanan beragam harga makanan food stall 10 kasir suasana ramai perhatian parkir duduk',
 'pakai kartu kredit baca untung rugi',
 'unik bagus foto makanan enak pegawai ramah bersih luas wifi kencang harga standar sesuai nya menu masakan makanan barat indonesia menu favorit lychee mojito spagheti',
 'keluarga menikmati pengalaman kuliner meny

In [134]:
cv = CountVectorizer()
cv.fit(dataprep)

X = cv.transform(dataprep)
print('Feature Extraction done')
print (X)

Feature Extraction done
  (0, 918)	1
  (0, 1013)	1
  (0, 1163)	1
  (0, 1488)	1
  (0, 1713)	1
  (0, 2098)	1
  (0, 3742)	1
  (0, 3794)	2
  (0, 4086)	1
  (0, 5365)	1
  (0, 5759)	1
  (0, 6297)	1
  (0, 6474)	1
  (0, 7329)	1
  (0, 8398)	1
  (0, 8470)	1
  (0, 9197)	2
  (0, 9228)	1
  (0, 10067)	1
  (0, 10306)	1
  (0, 10998)	1
  (0, 11786)	1
  (0, 11824)	1
  (0, 12861)	1
  (0, 14113)	1
  :	:
  (10999, 911)	1
  (10999, 1168)	1
  (10999, 1224)	1
  (10999, 1383)	1
  (10999, 1485)	1
  (10999, 1867)	1
  (10999, 1999)	1
  (10999, 4020)	1
  (10999, 4763)	1
  (10999, 4764)	1
  (10999, 6212)	1
  (10999, 6312)	1
  (10999, 7406)	1
  (10999, 7735)	1
  (10999, 7873)	1
  (10999, 8026)	1
  (10999, 8027)	2
  (10999, 10067)	3
  (10999, 11410)	1
  (10999, 11411)	1
  (10999, 12207)	1
  (10999, 12243)	1
  (10999, 14653)	1
  (10999, 14809)	1
  (10999, 15531)	1


In [135]:
pickle.dump(cv, open('asset/feature.pickle','wb'))

In [136]:
Y = train.Sentiment
Y

0        positive
1         neutral
2        positive
3        positive
4        negative
           ...   
10995    positive
10996    positive
10997     neutral
10998    negative
10999    positive
Name: Sentiment, Length: 11000, dtype: object

In [137]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2)

In [138]:
model = MultinomialNB()
model.fit(X_train, Y_train)

print('Training selesai')

Training selesai


In [139]:
pickle.dump(model, open('asset/model.pickle','wb'))

In [142]:
test = model.predict(X_test)

print ('testing selesai')
print(classification_report(Y_test, test))

testing selesai
              precision    recall  f1-score   support

    negative       0.76      0.75      0.76       709
     neutral       0.82      0.64      0.72       230
    positive       0.87      0.91      0.89      1261

    accuracy                           0.83      2200
   macro avg       0.82      0.77      0.79      2200
weighted avg       0.83      0.83      0.83      2200



In [143]:
explainer = shap.Explainer(model.predict_proba, X_train)
shap_values = explainer(X_train[0])
shap.plots.text(shap_values)

DimensionError: The passed data does not match the background shape expected by the masker! The data of shape (1, 15611) was passed while the masker expected data of shape (15611,).

In [144]:
# Create a KFold object with 5 splits and a fixed random state
Kf = KFold(n_splits=5, shuffle=True, random_state=42)
akurasi = []

In [145]:
for iteration, data in enumerate(Kf.split(X), start=1):
    data_train = X[data[0]]
    target_train = Y[data[0]]

    data_test = X[data[1]]
    target_test = Y[data[1]]

    clf = MultinomialNB()
    clf.fit(data_train, target_train)
    
    pred = clf.predict(data_test)
    accuracy = accuracy_score(target_test,pred)

    print("Training ke: ", iteration)
    print("---")
    print(classification_report(target_test,pred))

akurasi.append(accuracy)
print("rata-rata akurasi: ", np.mean(akurasi))


Training ke:  1
---
              precision    recall  f1-score   support

    negative       0.74      0.73      0.74       680
     neutral       0.80      0.67      0.73       239
    positive       0.86      0.90      0.88      1281

    accuracy                           0.82      2200
   macro avg       0.80      0.76      0.78      2200
weighted avg       0.82      0.82      0.82      2200

Training ke:  2
---
              precision    recall  f1-score   support

    negative       0.78      0.75      0.76       706
     neutral       0.78      0.67      0.72       220
    positive       0.87      0.91      0.89      1274

    accuracy                           0.83      2200
   macro avg       0.81      0.78      0.79      2200
weighted avg       0.83      0.83      0.83      2200

Training ke:  3
---
              precision    recall  f1-score   support

    negative       0.78      0.74      0.76       682
     neutral       0.81      0.73      0.76       215
    positive   

In [146]:
ori = '''
bacot!!!! kamu terlalu banyak bicara'''

te = cv.transform([clean(ori)])
res = model.predict(te)[0]

print(ori)
print('sentiment:', res)


bacot!!!! kamu terlalu banyak bicara
sentiment: negative
