In [1]:
import pandas as pd

df = pd.read_csv('train_preprocess.tsv', sep='\t', header=None, names = ['Text', 'Label'])
df.head()

Unnamed: 0,Text,Label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [2]:
df.shape

(11000, 2)

In [3]:
df.Label.value_counts()

positive    6416
negative    3436
neutral     1148
Name: Label, dtype: int64

<h1>Cleansing

In [4]:
import re

def cleansing(sent):
    string = sent.lower()
    string = re.sub(r'[^a-zA-Z0-9]',' ',string)
    return string

df['text_clean'] = df.Text.apply(cleansing)
df

Unnamed: 0,Text,Label,text_clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus dan k212 mmbri hujjah partai...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis di jalan sumatera bandung t...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya diri ini saat unboxing pake...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh jadi mahasiswa jangan sombong dong kas...
...,...,...,...
10995,tidak kecewa,positive,tidak kecewa
10996,enak rasa masakan nya apalagi kepiting yang me...,positive,enak rasa masakan nya apalagi kepiting yang me...
10997,hormati partai-partai yang telah berkoalisi,neutral,hormati partai partai yang telah berkoalisi
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative,pagi pagi di tol pasteur sudah macet parah b...


In [5]:
data_preprocessed = df.text_clean.tolist()

<h1>Feature Extraction

In [6]:
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords as stopwords_scratch

# list_stopwords = stopwords_scratch.words('indonesian')
# list_stopwords_en = stopwords_scratch.words('english')
# list_stopwords.extend(list_stopwords_en)
# list_stopwords.extend(['ya', 'yg', 'yuk', 'dah', 'nya', 'sih'])
# stopwords = list_stopwords

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# count_vect = CountVectorizer(stop_words=stopwords)
count_vect = CountVectorizer()
count_vect.fit(data_preprocessed)

X = count_vect.transform(data_preprocessed)
print("Feature Extraction Selesai")

Feature Extraction Selesai


In [8]:
import pickle

pickle.dump(count_vect, open("feature.p", "wb"))

<h1>Neural Network

In [9]:
from sklearn.model_selection import train_test_split

classes = df.Label
classes

0        positive
1         neutral
2        positive
3        positive
4        negative
           ...   
10995    positive
10996    positive
10997     neutral
10998    negative
10999    positive
Name: Label, Length: 11000, dtype: object

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size = 0.2)

In [11]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model.fit(X_train, y_train)

print("Training Selesai")

Training Selesai


In [12]:
pickle.dump(model, open("model.p", "wb"))

In [13]:
test = model.predict(X_test)

print("Testing Selesai")

Testing Selesai


In [14]:
test[:10]

array(['positive', 'positive', 'positive', 'positive', 'negative',
       'neutral', 'positive', 'neutral', 'negative', 'negative'],
      dtype='<U8')

In [15]:
y_test[:10].values

array(['positive', 'positive', 'positive', 'positive', 'negative',
       'neutral', 'positive', 'neutral', 'positive', 'negative'],
      dtype=object)

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, test))

              precision    recall  f1-score   support

    negative       0.77      0.80      0.78       662
     neutral       0.81      0.67      0.73       218
    positive       0.89      0.90      0.90      1320

    accuracy                           0.85      2200
   macro avg       0.83      0.79      0.80      2200
weighted avg       0.85      0.85      0.85      2200



In [17]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5,random_state=42,shuffle=True)

accuracies = []

y = classes

for iteration, data in enumerate(kf.split(X), start=1):
    
    data_train = X[data[0]]
    target_train = y[data[0]]
    
    data_test = X[data[1]]
    target_test = y[data[1]]
    
    clf = MLPClassifier()
    
    clf.fit(data_train, target_train)
    
    preds = clf.predict(data_test)
    accuracy = accuracy_score(target_test, preds)
    
    print("Training ke-", iteration)
    print(classification_report(target_test, preds))
    print("========================================================")
    
    accuracies.append(accuracy)
    
avg_acc = np.mean(accuracies)
pickle.dump(clf, open("modelNN2.p", "wb"))

print()
print()
print("Rata-rata akurasi: ", avg_acc)

Training ke- 1
              precision    recall  f1-score   support

    negative       0.77      0.78      0.78       680
     neutral       0.75      0.63      0.68       239
    positive       0.87      0.89      0.88      1281

    accuracy                           0.83      2200
   macro avg       0.80      0.77      0.78      2200
weighted avg       0.83      0.83      0.83      2200

Training ke- 2
              precision    recall  f1-score   support

    negative       0.81      0.77      0.79       706
     neutral       0.73      0.71      0.72       220
    positive       0.88      0.91      0.89      1274

    accuracy                           0.84      2200
   macro avg       0.80      0.79      0.80      2200
weighted avg       0.84      0.84      0.84      2200

Training ke- 3
              precision    recall  f1-score   support

    negative       0.80      0.80      0.80       682
     neutral       0.85      0.72      0.78       215
    positive       0.89      0

In [19]:
# teks bebas
teks = '''
Ga bersyukur banget jadi orang sombong.
'''

text = count_vect.transform([cleansing(teks)])

result = model.predict(text)[0]
print("Sentiment: ")
print()
print(result)

Sentiment: 

negative
