In [1]:
#import modul/library
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

from Cleansing import clean

## EDA & Pre Processing

In [2]:
#read data training
trains = pd.read_csv("dataset/train_preprocess.tsv.txt", sep='\t', names=['Kalimat','Sentiment'])
trains.head()

Unnamed: 0,Kalimat,Sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [3]:
#check data volume
trains.shape

(11000, 2)

In [5]:
#check data null
trains.isna().sum()

Kalimat      0
Sentiment    0
dtype: int64

In [6]:
#check total data duplicated
trains.duplicated().sum()

67

In [13]:
#Remove data duplicated
trainn = trains.drop_duplicates(inplace=False, ignore_index=True)

In [14]:
#check data voulume after remove duplicated
trainn.shape

(10933, 2)

In [15]:
#check distribution label
trainn.Sentiment.value_counts()

positive    6383
negative    3412
neutral     1138
Name: Sentiment, dtype: int64

In [16]:
#Clean data training
trainn['Clean'] = trainn.Kalimat.apply(clean)
trainn.head()

Unnamed: 0,Kalimat,Sentiment,Clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung dimiliki pengusaha pabrik puluhan terke...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus k212 mmbri hujjah partai diw...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatra bandung nya nya...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya unboxing paket barang nya b...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,aduh mahasiswa sombong kasih kartu kuning bela...


## Feature Extraction

In [17]:
dataprep = trainn.Clean.to_list()

In [18]:
cv_nn = CountVectorizer()
cv_nn.fit(dataprep)

X = cv_nn.transform(dataprep)
print('Feature Extraction done')
print (X)

Feature Extraction done
  (0, 918)	1
  (0, 1013)	1
  (0, 1163)	1
  (0, 1488)	1
  (0, 1713)	1
  (0, 2098)	1
  (0, 3742)	1
  (0, 3794)	2
  (0, 4086)	1
  (0, 5365)	1
  (0, 5759)	1
  (0, 6297)	1
  (0, 6474)	1
  (0, 7329)	1
  (0, 8398)	1
  (0, 8470)	1
  (0, 9197)	2
  (0, 9228)	1
  (0, 10067)	1
  (0, 10306)	1
  (0, 10998)	1
  (0, 11786)	1
  (0, 11824)	1
  (0, 12861)	1
  (0, 14113)	1
  :	:
  (10932, 911)	1
  (10932, 1168)	1
  (10932, 1224)	1
  (10932, 1383)	1
  (10932, 1485)	1
  (10932, 1867)	1
  (10932, 1999)	1
  (10932, 4020)	1
  (10932, 4763)	1
  (10932, 4764)	1
  (10932, 6212)	1
  (10932, 6312)	1
  (10932, 7406)	1
  (10932, 7735)	1
  (10932, 7873)	1
  (10932, 8026)	1
  (10932, 8027)	2
  (10932, 10067)	3
  (10932, 11410)	1
  (10932, 11411)	1
  (10932, 12207)	1
  (10932, 12243)	1
  (10932, 14653)	1
  (10932, 14809)	1
  (10932, 15531)	1


In [19]:
pickle.dump(cv_nn, open('asset/feature/feature_nn.pickle','wb'))

## Train Model

In [20]:
Y = trainn.Sentiment
Y

0        positive
1         neutral
2        positive
3        positive
4        negative
           ...   
10928     neutral
10929    positive
10930    positive
10931    negative
10932    positive
Name: Sentiment, Length: 10933, dtype: object

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2)

In [22]:
model_nn = MLPClassifier()
model_nn.fit(X_train, Y_train)

print('Training selesai')

Training selesai


In [23]:
pickle.dump(model_nn, open('asset/model/model_nn.pickle','wb'))

In [24]:
test = model_nn.predict(X_test)

print ('testing selesai')
print(classification_report(Y_test, test))

testing selesai
              precision    recall  f1-score   support

    negative       0.71      0.74      0.73       699
     neutral       0.74      0.52      0.61       246
    positive       0.85      0.88      0.86      1242

    accuracy                           0.79      2187
   macro avg       0.77      0.71      0.73      2187
weighted avg       0.79      0.79      0.79      2187



In [25]:
# Create a KFold object with 5 splits and a fixed random state
Kf = KFold(n_splits=5, shuffle=True, random_state=42)
akurasi = []

In [26]:
for iteration, data in enumerate(Kf.split(X), start=1):
    data_train = X[data[0]]
    target_train = Y[data[0]]

    data_test = X[data[1]]
    target_test = Y[data[1]]

    clf = MLPClassifier()
    clf.fit(data_train, target_train)
    
    pred = clf.predict(data_test)
    akrsi = accuracy_score(target_test,pred)

    print("Training ke: ", iteration)
    print("---")
    print(classification_report(target_test,pred))

akurasi.append(akrsi)
print("rata-rata akurasi: ", np.mean(akurasi))

Training ke:  1
---
              precision    recall  f1-score   support

    negative       0.71      0.73      0.72       690
     neutral       0.71      0.53      0.61       220
    positive       0.84      0.87      0.86      1277

    accuracy                           0.79      2187
   macro avg       0.76      0.71      0.73      2187
weighted avg       0.79      0.79      0.79      2187

Training ke:  2
---
              precision    recall  f1-score   support

    negative       0.72      0.71      0.72       667
     neutral       0.70      0.57      0.63       219
    positive       0.85      0.89      0.87      1301

    accuracy                           0.80      2187
   macro avg       0.76      0.72      0.74      2187
weighted avg       0.80      0.80      0.80      2187

Training ke:  3
---
              precision    recall  f1-score   support

    negative       0.72      0.70      0.71       695
     neutral       0.70      0.57      0.63       213
    positive   

## Predict

In [27]:
ori = '''
sabar sik ae rek'''

te = cv_nn.transform([clean(ori)])
res = model_nn.predict(te)[0]

print(ori)
print('sentiment:', res)


sabar sik ae rek
sentiment: positive
