## Import Modules and Libraries

In [None]:
from Reader import Reader
from VectorBuilder import VectorBuilder
from DocTokenizer import DocTokenizer
from gensim.models.word2vec import Word2Vec
from keras.preprocessing.text import Tokenizer
from DataSplitter import DataSplitter
from DnnModel import DnnModel
from Evaluator import Evaluator
from PvModel import PvModel
import numpy as np

## Pembentukan Model Vektor Paragraf

In [None]:
# Corpus identifier; it is passed to PvModel.create_pv_model in the next cell.
corpus_name = "prosa"

In [None]:
# Read the corpus CSV, run its 'content' column through DocTokenizer and pass
# the result, together with corpus_name, to PvModel.create_pv_model.
corpus_path = "../resources/corpus/prosa/data_clean_punctuation/all_data.csv"
corpus = Reader()
corpus.read_corpus(corpus_path)

tokenizer = DocTokenizer()
documents = corpus.data_frame['content']
pv_input = tokenizer.fit_corpus(documents)

pv_model = PvModel()
pv_model.create_pv_model(corpus_name, pv_input)

## Pembangunan Model Klasifikasi Sentimen

In [None]:
# --- Configuration for training the sentiment classifier ---

# Embedding / vocabulary settings
embedding_size = 500
max_vocab = 19962
max_sequence = 95  # the original cell carried the inline note "85" here
trainable = False

# Input representation switches
hierarchy = True
paragraph_vec = False
doc_vector = 'prosa'

# bi-rnn model
rnn_type, rnn_unit = 'bi-lstm', 128

# cnn model
extra_conv, cnn_kernel = True, [1, 2, 3]

# hierarchical model
max_sents, max_sen_len = 15, 30
dnn_sent_level, dnn_doc_level = 'lstm', 'bi-gru'
lstm_unit = grnn_unit = 128

# Training run: output path for the model file, epochs and batch size
model_path = '../model/prosa/cnn/cnn_model_99.h5'
num_epochs, batch_size = 1, 256

In [None]:
# Read the training CSV; the `hierarchy` flag is forwarded to Reader.read_file.
train_csv = "../resources/corpus/prosa/data_clean/prosa_data_train.csv"
data_train = Reader()
data_train.read_file(train_csv, hierarchy)

In [None]:
# Load the saved word2vec model and build the embedding matrix for the
# vocabulary seen in the training data.
word2vec = Word2Vec.load('../resources/prosa-w2v/prosa.vec')

# Look vectors up through the KeyedVectors object (`.wv`): indexing and
# membership tests on the Word2Vec model itself are deprecated in gensim
# 1.x-3.x and removed in gensim 4. Fall back to the model object when `.wv`
# is missing so very old gensim releases keep working.
word_vectors = getattr(word2vec, 'wv', word2vec)

tokenizer = Tokenizer(num_words=max_vocab, lower=True, char_level=False)
tokenizer.fit_on_texts(data_train.data_frame['content'].tolist())
train_word_index = tokenizer.word_index

# One row per word index plus one extra row: Keras word indices start at 1,
# so row 0 is never assigned and stays all-zero.
train_embedding = np.zeros((len(train_word_index) + 1, embedding_size))
for word, index in train_word_index.items():
    if word in word_vectors:
        train_embedding[index, :] = word_vectors[word]
    else:
        # Words without a word2vec vector get a random vector, uniform in [0, 1).
        train_embedding[index, :] = np.random.rand(embedding_size)

In [None]:
# Turn the training texts into model inputs and collect their labels.
vector_builder = VectorBuilder(
    word2vec,
    embedding_size,
    max_vocab,
    max_sequence,
    paragraph_vec,
    corpus=data_train.data_frame['content'],
    doc_vector=doc_vector,
)

# Paragraph-vector mode reads the 'tokens' column; otherwise 'content' is used.
data = data_train.data_frame['tokens' if paragraph_vec else 'content']

# Hierarchical mode builds per-sentence sequences bounded by max_sents and
# max_sen_len; flat mode builds one sequence per document.
train = (
    vector_builder.build_hierarchy_sequences(data, max_sents, max_sen_len)
    if hierarchy
    else vector_builder.build_sequences(data)
)

label_train = data_train.data_frame['sentiment'].values

In [None]:
# Split the training inputs and labels with DataSplitter.
# NOTE(review): 0.1 is presumably the validation fraction; confirm against
# DataSplitter.split.
split_ratio = 0.1
exp_data = DataSplitter()
exp_data.split(train, label_train, split_ratio)

In [None]:
# Build the hierarchical network on top of the prepared embedding matrix and
# fit it on the train/validation split; model_path is passed on to fit().
# NOTE(review): the `hierarchy` flag is not consulted here - the hierarchical
# model is always created.
model = DnnModel(train_embedding, embedding_size, max_sequence, paragraph_vec)
model.create_hierarchy_model(
    max_sents,
    max_sen_len,
    dnn_sent_level,
    dnn_doc_level,
    trainable,
    lstm_unit=lstm_unit,
    grnn_unit=grnn_unit,
)
model.fit(
    exp_data.x_train,
    exp_data.y_train,
    exp_data.x_validation,
    exp_data.y_validation,
    num_epochs,
    batch_size,
    model_path,
)

## Evaluasi Model Klasifikasi Sentimen

In [None]:
# --- Configuration for evaluating the sentiment classifier ---

# Embedding / vocabulary settings
embedding_size, max_vocab = 500, 19962
max_sequence = 95  # the original cell carried the inline note "85" here

# Input representation switches
hierarchy, paragraph_vec = True, False
doc_vector = 'prosa'

# hierarchical model
max_sents, max_sen_len = 15, 30

In [None]:
# Read the train and test CSVs; `hierarchy` is forwarded to Reader.read_file.
# The training data is read again because its 'content' column is handed to
# VectorBuilder in the next cell.
train_csv = "../resources/corpus/prosa/data_clean/prosa_data_train.csv"
data_train = Reader()
data_train.read_file(train_csv, hierarchy)

test_csv = "../resources/corpus/prosa/data_clean/prosa_data_test.csv"
data_test = Reader()
data_test.read_file(test_csv, hierarchy)

In [None]:
# Turn the test texts into model inputs and collect their labels. The builder
# is constructed with the training corpus' 'content' column.
word2vec = Word2Vec.load('../resources/prosa-w2v/prosa.vec')
vector_builder = VectorBuilder(
    word2vec,
    embedding_size,
    max_vocab,
    max_sequence,
    paragraph_vec,
    corpus=data_train.data_frame['content'],
    doc_vector=doc_vector,
)

# Paragraph-vector mode reads the 'tokens' column; otherwise 'content' is used.
data = data_test.data_frame['tokens' if paragraph_vec else 'content']

# Hierarchical mode builds per-sentence sequences bounded by max_sents and
# max_sen_len; flat mode builds one sequence per document.
test = (
    vector_builder.build_hierarchy_sequences(data, max_sents, max_sen_len)
    if hierarchy
    else vector_builder.build_sequences(data)
)

label_test = data_test.data_frame['sentiment'].values

In [None]:
# Load a saved model, predict on the test inputs and show the evaluation
# against the test labels.
model_path = '../model/prosa/cnn_bi_lstm_model.h5'

# An empty list is given as the embedding argument; the model itself is then
# loaded from model_path.
model = DnnModel([], embedding_size, max_sequence, paragraph_vec)
model.load_model(model_path)

y_pred = model.predict(test)

evaluator = Evaluator(label_test)
evaluator.show_evaluation(y_pred)