# Load Datasets & Quick EDA

## FD Dataset 



In [1]:
import warnings 

warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

df_train = pd.read_csv("/content/drive/MyDrive/Data NusaX/train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Data NusaX/test.csv")

df = df_train.append(df_test, ignore_index=True)

df_val = pd.read_csv("/content/drive/MyDrive/Data NusaX/valid.csv")

df = df.append(df_val, ignore_index=True)
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,text,label
0,Nikmati cicilan 0% hingga 12 bulan untuk pemes...,neutral
1,Kue-kue yang disajikan bikin saya bernostalgia...,positive
2,Ibu pernah bekerja di grab indonesia,neutral
3,Paling suka banget makan siang di sini ayam sa...,positive
4,Pelayanan bus DAMRI sangat baik,positive


In [4]:
df.shape

(1000, 2)

In [5]:
df.label.value_counts()#dataset_fd

negative    383
positive    378
neutral     239
Name: label, dtype: int64

## Challange Dataset

In [6]:
df_challange = pd.read_csv("/content/drive/MyDrive/platinum challenge/data.csv", encoding='ISO-8859-1')

In [7]:
cols = df_challange.columns[1:]

In [8]:
df_challange['label'] = df_challange[cols].any(axis=1).astype(int)

In [9]:
df_challange = df_challange.drop(columns=cols)

In [10]:
df_challange.columns = ['text','label']

In [11]:
df_challange.label.value_counts()

1    7309
0    5860
Name: label, dtype: int64

In [12]:
df_challange = df_challange.replace({1: 'negative', 0: 'positive'})

In [13]:
df_challange.head()

Unnamed: 0,text,label
0,- disaat semua cowok berusaha melacak perhatia...,negative
1,RT USER: USER siapa yang telat ngasih tau elu?...,negative
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",positive
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,positive
4,USER USER Kaum cebong kapir udah keliatan dong...,negative


## IndoNLU Dataset

In [14]:
df_indonlu_test = pd.read_csv("/content/drive/MyDrive/platinum challenge/test_preprocess.tsv", sep='\t', header= None)
df_indonlu_train = pd.read_csv("/content/drive/MyDrive/platinum challenge/train_preprocess.tsv", sep='\t', header= None)
df_indonlu_valid = pd.read_csv("/content/drive/MyDrive/platinum challenge/valid_preprocess.tsv", sep='\t', header= None)

df_indonlu = df_indonlu_train.append(df_indonlu_test, ignore_index=True)
df_indonlu = df_indonlu.append(df_indonlu_valid, ignore_index=True)
df_indonlu.columns = ['text','label']
df_indonlu.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [15]:
df_indonlu.label.value_counts()

positive    7359
negative    4034
neutral     1367
Name: label, dtype: int64

In [16]:
df_indonlu.shape

(12760, 2)

# Data Cleansing

In [17]:
print(df.shape, df_challange.shape, df_indonlu.shape)

(1000, 2) (13169, 2) (12760, 2)


In [18]:
print(df.columns, df_challange.columns, df_indonlu.columns)

Index(['text', 'label'], dtype='object') Index(['text', 'label'], dtype='object') Index(['text', 'label'], dtype='object')


In [19]:
df_final = df.append(df_challange, ignore_index=True)
df_final = df_final.append(df_indonlu, ignore_index=True)
df_final.shape

(26929, 2)

In [20]:
# df_final = df_final.drop(df_final[df_final['label'] == 'neutral'].index)

In [21]:
df_final.label.value_counts()

positive    13597
negative    11726
neutral      1606
Name: label, dtype: int64

In [22]:
df = df_final.copy()

In [23]:
import re
import pandas as pd
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('indonesian'))

def cleansing(sent):
  string = sent.lower()
  string = re.sub(r'[^a-zA-z0-9]',' ', string)
  string = re.sub(r'[^\w]',' ', string)

  words = nltk.word_tokenize(string)
  words = [word for word in words if word not in stop_words]

  text = ' '.join(words)
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [24]:
df.columns

Index(['text', 'label'], dtype='object')

In [25]:
df.shape

(26929, 2)

In [26]:
df['text'] = df.text.apply(cleansing)
df.head()

Unnamed: 0,text,label
0,nikmati cicilan 0 12 pemesanan tiket pesawat a...,neutral
1,kue kue disajikan bikin bernostalgia tipikal k...,positive
2,grab indonesia,neutral
3,suka banget makan siang ayam sambalnya enak ba...,positive
4,pelayanan bus damri,positive


In [27]:
data_preprocessed = df.text.tolist()
type(data_preprocessed)

list

In [28]:
len(data_preprocessed)

26929

# Features Extraction

In [29]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

count_vect = TfidfVectorizer()
# count_vect = CountVectorizer()
count_vect.fit(data_preprocessed)

X = count_vect.transform(data_preprocessed)
print("Feature Extraction Done !")

Feature Extraction Done !


In [30]:
import pickle

pickle.dump(count_vect, open("feature.p", "wb"))

# Test Split 

In [31]:
from sklearn.model_selection import train_test_split

classes = df.label
classes

0         neutral
1        positive
2         neutral
3        positive
4        positive
           ...   
26924    negative
26925    negative
26926    negative
26927    negative
26928    positive
Name: label, Length: 26929, dtype: object

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size= 0.2)

# Modeling

## Simple NN , MLP

In [33]:
#training
from sklearn.neural_network import MLPClassifier

model_mlp = MLPClassifier()
model_mlp.fit(X_train, y_train)

print("Training Selesai !")

Training Selesai !


In [34]:
pickle.dump(model_mlp, open("model.p", "wb"))

In [35]:
#evaluations
from sklearn.metrics import classification_report


test = model_mlp.predict(X_test)

print("Testing Selesai !")

print(classification_report(y_test, test))

Testing Selesai !
              precision    recall  f1-score   support

    negative       0.80      0.82      0.81      2316
     neutral       0.72      0.58      0.65       299
    positive       0.83      0.83      0.83      2771

    accuracy                           0.81      5386
   macro avg       0.79      0.74      0.76      5386
weighted avg       0.81      0.81      0.81      5386



In [36]:
#predict
original_text = '''
barangnya bagus
'''

text = count_vect.transform([cleansing(original_text)])

result = model_mlp.predict(text)[0]
print("Sentiment: ", result)

Sentiment:  positive


## RNN

In [37]:
df.head()

Unnamed: 0,text,label
0,nikmati cicilan 0 12 pemesanan tiket pesawat a...,neutral
1,kue kue disajikan bikin bernostalgia tipikal k...,positive
2,grab indonesia,neutral
3,suka banget makan siang ayam sambalnya enak ba...,positive
4,pelayanan bus damri,positive


In [38]:
df.label.value_counts()

positive    13597
negative    11726
neutral      1606
Name: label, dtype: int64

In [39]:
neg = df.loc[df['label']=='negative'].text.tolist()
pos = df.loc[df['label']=='positive'].text.tolist()
net = df.loc[df['label']=='neutral'].text.tolist()


neg_label = df.loc[df['label']=='negative'].label.tolist()
pos_label = df.loc[df['label']=='positive'].label.tolist()
net_label = df.loc[df['label']=='neutral'].label.tolist()

total_data = pos+neg+net
labels = pos_label+neg_label + net_label

In [40]:
import pickle 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict

max_features = 100000
tokenizer = Tokenizer(num_words=max_features, split= ' ', lower=True)
tokenizer.fit_on_texts(total_data)
with open('tokenizer.pickle','wb') as handle:
  pickle.dump(tokenizer, handle, protocol= pickle.HIGHEST_PROTOCOL)
  print("tokenizer.pickle has created !")

X = tokenizer.texts_to_sequences(total_data)

vocab_size = len(tokenizer.word_index)
maxlen = max(len(x) for x in X)

X = pad_sequences(X)
with open('x_pad_sequences.pickle','wb') as handle:
  pickle.dump(X, handle, protocol = pickle.HIGHEST_PROTOCOL)
  print('x_pad_sequences.pickle has created !')

tokenizer.pickle has created !
x_pad_sequences.pickle has created !


In [41]:
Y = pd.get_dummies(labels)
Y = Y.values

with open('y_labels.pickle','wb') as handle:
  pickle.dump(Y, handle, protocol = pickle.HIGHEST_PROTOCOL)
  print('y_labels.pickle has created !')

y_labels.pickle has created !


In [42]:
from sklearn.model_selection import train_test_split

file = open('x_pad_sequences.pickle','rb')
X = pickle.load(file)
file.close()

file = open('y_labels.pickle','rb')
Y = pickle.load(file)
file.close()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state=1)

In [43]:
#training
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN, Activation
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Flatten
from tensorflow.keras import backend as K

embed_dim = 100
units = 64

model_rnn = Sequential()
model_rnn.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model_rnn.add(SimpleRNN(units,dropout = 0.2))
model_rnn.add(Dense(3, activation='softmax'))
sgd= optimizers.Adam(lr=0.001)
model_rnn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model_rnn.summary())

adam = optimizers.Adam(lr=0.001)
model_rnn.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='main',verbose = 1)
history = model_rnn.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test), verbose=1, callbacks=[es])



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 114, 100)          10000000  
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                10560     
                                                                 
 dense (Dense)               (None, 3)                 195       
                                                                 
Total params: 10,010,755
Trainable params: 10,010,755
Non-trainable params: 0
_________________________________________________________________




None
Epoch 1/10
Epoch 2/10
Epoch 2: early stopping


In [44]:
model_rnn.save('model_rnn.h5')
print("Model has Creqated !")

Model has Creqated !


In [45]:
#evaluation 
from sklearn import metrics 

predictions = model_rnn.predict(X_test)
y_pred= predictions
matrix_test = metrics.classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print("Testing Done !")
print(matrix_test)

Testing Done !
              precision    recall  f1-score   support

           0       0.81      0.82      0.82      2274
           1       0.75      0.54      0.63       353
           2       0.82      0.85      0.84      2759

    accuracy                           0.82      5386
   macro avg       0.80      0.74      0.76      5386
weighted avg       0.82      0.82      0.81      5386



In [46]:
#prediction
import re
from keras.models import load_model 
input_text = '''
produknya kacau
'''
sentiment = ['positive','negative','neutral']

text = [cleansing(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

model = load_model('model_rnn.h5')
prediction= model.predict(guess)
polarity = np.argmax(prediction[0])

print("Text: ", text[0])
print("Sentiment: ", sentiment[polarity])

Text:  produknya kacau
Sentiment:  positive


## LSTM

In [47]:
#training
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN, Activation
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Flatten
from tensorflow.keras import backend as K

embed_dim = 100
units = 64

model_lstm = Sequential()
model_lstm.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model_lstm.add(LSTM(units,dropout = 0.2))
model_lstm.add(Dense(3, activation='softmax'))
sgd= optimizers.Adam(lr=0.001)
model_lstm.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model_lstm.summary())

adam = optimizers.Adam(lr=0.001)
model_lstm.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='main',verbose = 1)
history = model_lstm.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test), verbose=1, callbacks=[es])



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 114, 100)          10000000  
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense_1 (Dense)             (None, 3)                 195       
                                                                 
Total params: 10,042,435
Trainable params: 10,042,435
Non-trainable params: 0
_________________________________________________________________




None
Epoch 1/10
Epoch 2/10
Epoch 2: early stopping


In [48]:
model_lstm.save('model_lstm.h5')
print("Model has Creqated !")

Model has Creqated !


In [49]:
#evaluation
from sklearn import metrics 

predictions = model_lstm.predict(X_test)
y_pred= predictions
matrix_test = metrics.classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print("Testing Done !")
print(matrix_test)

Testing Done !
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      2274
           1       0.69      0.66      0.67       353
           2       0.84      0.88      0.86      2759

    accuracy                           0.84      5386
   macro avg       0.80      0.79      0.79      5386
weighted avg       0.84      0.84      0.84      5386



In [50]:
#prediction
import re
from keras.models import load_model 
input_text = '''
produknya kacau
'''
sentiment = ['positive','negative','neutral']

text = [cleansing(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

model = load_model('/content/model_lstm.h5')
prediction= model.predict(guess)
polarity = np.argmax(prediction[0])

print("Text: ", text[0])
print("Sentiment: ", sentiment[polarity])

Text:  produknya kacau
Sentiment:  positive
