In [2]:
import numpy as np
import pandas as pd
import keras
from sklearn.metrics import accuracy_score, classification_report
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
import keras.layers.merge
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model
from keras.callbacks import EarlyStopping
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import nltk
from nltk.tokenize import RegexpTokenizer
import re
import codecs
import matplotlib.pyplot as plt
from gensim.models import Doc2Vec
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Width of each word vector; the embedding matrix below is allocated with this many columns.
# NOTE(review): the model summary saved in this notebook shows an embedding output of
# (None, 100, 700), i.e. it was produced with a width of 700, not 500 — confirm which
# value matches the word2vec model being loaded.
EMBEDDING_DIM = 500 # how big is each word vector
# Passed as `num_words` to the Keras Tokenizer. The recorded run found 16125 unique
# tokens, so this cap was not binding there.
MAX_VOCAB_SIZE = 175303 # how many unique words to use (i.e num rows in embedding vector)
# Passed as `maxlen` to pad_sequences. The longest training comment in the recorded run
# had 84 tokens.
MAX_SEQUENCE_LENGTH = 100 # max number of words in a comment to use

# Training parameters handed to model.fit
batch_size = 256  # samples per gradient update
num_epochs = 20 # number of passes over the training data

In [4]:
def preprocess(text):
    """Normalise one raw comment before tokenisation.

    Steps, in order:
      1. drop every character that is neither a word character nor whitespace;
      2. replace each run of digits with the placeholder ``<number>``.

    Punctuation is removed first, so a value such as ``3.5`` collapses to ``35``
    and becomes a single ``<number>``.

    Parameters
    ----------
    text : str or scalar
        The raw comment. Missing values (``None`` / NaN) become the empty string,
        and any other non-string scalar is converted with ``str()``. This function
        is applied to the CSV column before it is cast to ``str``, so a single
        empty cell would otherwise raise ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned comment.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '<number>', text)
    return text

In [5]:
# Read the training split and clean its `content` column in one chained expression.
clean_train_comments = (
    pd.read_csv("./corpus/prosa/data_clean_punctuation/data_train_full.csv")
    .assign(content=lambda frame: frame['content'].apply(preprocess))
)
clean_train_comments.head()

Unnamed: 0,content,polarity
0,gerindra alihkan rekomendasi ke agus an tanri ...,neutral
1,cuci tangan pakai sunlight stelah itu pakai sa...,neutral
2,kasus toko obat digerebek fpi propam akan peri...,neutral
3,menkeu melemah nya rupiah lebih berpengaruh pa...,neutral
4,minyak jarak castor oil <number> ml,neutral


In [6]:
# Word-level tokenizer: each maximal run of word characters becomes one token.
tokenizer = RegexpTokenizer(r'\w+')

# Make sure every comment is a string, then split it into tokens.
clean_train_comments['content'] = clean_train_comments['content'].astype(str)
clean_train_comments['tokens'] = clean_train_comments['content'].map(tokenizer.tokenize)

# Integer class id per row; categories are the sorted distinct polarity labels.
clean_train_comments['sentiment'] = pd.Categorical(clean_train_comments['polarity']).codes

clean_train_comments.head()

Unnamed: 0,content,polarity,tokens,sentiment
0,gerindra alihkan rekomendasi ke agus an tanri ...,neutral,"[gerindra, alihkan, rekomendasi, ke, agus, an,...",1
1,cuci tangan pakai sunlight stelah itu pakai sa...,neutral,"[cuci, tangan, pakai, sunlight, stelah, itu, p...",1
2,kasus toko obat digerebek fpi propam akan peri...,neutral,"[kasus, toko, obat, digerebek, fpi, propam, ak...",1
3,menkeu melemah nya rupiah lebih berpengaruh pa...,neutral,"[menkeu, melemah, nya, rupiah, lebih, berpenga...",1
4,minyak jarak castor oil <number> ml,neutral,"[minyak, jarak, castor, oil, number, ml]",1


In [7]:
# Read the test split and clean its `content` column in one chained expression.
clean_test_comments = (
    pd.read_csv("./corpus/prosa/data_clean_punctuation/data_testing_full.csv")
    .assign(content=lambda frame: frame['content'].apply(preprocess))
)
clean_test_comments.head()

Unnamed: 0,content,polarity
0,kemarin gue datang ke tempat makan baru yang a...,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative
2,kalau dipikirpikir sebenarnya tidak ada yang b...,negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative


In [8]:
# Use a dedicated regex tokenizer here instead of the notebook-level `tokenizer` name:
# a later cell rebinds `tokenizer` to a Keras Tokenizer (which has no `.tokenize`),
# so re-running this cell after that one would otherwise fail.
test_word_tokenizer = RegexpTokenizer(r'\w+')

clean_test_comments['content'] = clean_test_comments['content'].astype('str')
clean_test_comments["tokens"] = clean_test_comments["content"].apply(test_word_tokenizer.tokenize)

# Encode test labels with the *training* categories. Deriving codes from the test
# split on its own would shift the ids whenever a class is missing from it, and the
# model's outputs would then be compared against misaligned labels.
train_polarity_classes = clean_train_comments['polarity'].astype('category').cat.categories
clean_test_comments['sentiment'] = pd.Categorical(
    clean_test_comments['polarity'], categories=train_polarity_classes
).codes

# pd.Categorical assigns -1 to any value outside `categories` (including NaN).
assert (clean_test_comments['sentiment'] >= 0).all(), \
    "test data contains a polarity label that never occurs in the training data"

clean_test_comments.head()

Unnamed: 0,content,polarity,tokens,sentiment
0,kemarin gue datang ke tempat makan baru yang a...,negative,"[kemarin, gue, datang, ke, tempat, makan, baru...",0
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative,"[kayak, nya, sih, gue, tidak, akan, mau, balik...",0
2,kalau dipikirpikir sebenarnya tidak ada yang b...,negative,"[kalau, dipikirpikir, sebenarnya, tidak, ada, ...",0
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative,"[ini, pertama, kalinya, gua, ke, bank, buat, n...",0
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative,"[waktu, sampai, dengan, gue, pernah, disuruh, ...",0


In [9]:
# Flatten the per-comment token lists into one corpus-wide list of words.
all_training_words = [
    token
    for comment_tokens in clean_train_comments["tokens"]
    for token in comment_tokens
]
# Token count of every training comment.
training_sentence_lengths = clean_train_comments["tokens"].map(len).tolist()
# Distinct training words, sorted.
TRAINING_VOCAB = sorted(set(all_training_words))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

235162 words total, with a vocabulary size of 16127
Max sentence length is 84


In [10]:
# Flatten the per-comment token lists into one list of all test words.
all_test_words = [
    token
    for comment_tokens in clean_test_comments["tokens"]
    for token in comment_tokens
]
# Token count of every test comment.
test_sentence_lengths = clean_test_comments["tokens"].map(len).tolist()
# Distinct test words, sorted.
TEST_VOCAB = sorted(set(all_test_words))

print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

10767 words total, with a vocabulary size of 2692
Max sentence length is 72


In [None]:
# Load the pre-trained gensim Word2Vec model; its vectors fill the embedding matrix
# built in the next cell.
# NOTE(review): this cell has no execution count in the saved notebook, so the recorded
# outputs further down may have been produced with a different model — confirm before
# relying on them.
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
# Alternative model location, kept for reference:
# word2vec = Word2Vec.load('./vectorizer/prosa/word2vec.model')

In [14]:
# Fit the Keras tokenizer on the training comments only, then turn each comment into
# a sequence of integer word ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
train_texts = clean_train_comments["content"].tolist()
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

# Pad/truncate every sequence to a fixed length so they stack into one 2-D array.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Look words up through the model's KeyedVectors (`.wv`); indexing the Word2Vec model
# itself is deprecated in gensim. Fall back to the object itself if it already is a
# bare KeyedVectors instance.
word_vectors = getattr(word2vec, 'wv', word2vec)
if word_vectors.vector_size != EMBEDDING_DIM:
    raise ValueError(
        "EMBEDDING_DIM is %s but the loaded word2vec vectors have size %s"
        % (EMBEDDING_DIM, word_vectors.vector_size)
    )

# Seeded generator so the random vectors for out-of-vocabulary words (and therefore
# the whole embedding matrix) are identical on every run.
oov_rng = np.random.RandomState(42)

# Row 0 stays all-zero: word ids start at 1 and 0 is the padding id.
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word_vectors:
        train_embedding_weights[index, :] = word_vectors[word]
    else:
        train_embedding_weights[index, :] = oov_rng.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

Found 16125 unique tokens.


In [18]:
# Encode the test comments with the tokenizer fitted on the training data, then
# pad/truncate them to the same fixed length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(texts=list(clean_test_comments["content"]))
test_cnn_data = pad_sequences(sequences=test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [19]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True, num_classes=3):
    """Build and compile a 1-D convolutional text classifier on pre-trained embeddings.

    Parameters
    ----------
    embeddings : array-like
        Initial weights for the embedding layer, passed as ``weights=[embeddings]``.
        Presumably shaped ``(num_words, embedding_dim)`` — verify against the caller.
    max_sequence_length : int
        Length of every (padded) input sequence.
    num_words : int
        Number of rows of the embedding layer.
    embedding_dim : int
        Width of each embedding vector.
    labels_index
        Unused. Kept only so existing positional calls continue to work.
    trainable : bool, default False
        Whether the embedding weights are updated during training.
    extra_conv : bool, default True
        ``True``  - multi-branch variant in the style of Yoon Kim
        (https://arxiv.org/abs/1408.5882): parallel Conv1D layers with kernel sizes
        3, 4 and 5, each followed by ``MaxPooling1D(pool_size=3)``, concatenated
        along axis 1.
        ``False`` - a single Conv1D (kernel size 3) followed by
        ``MaxPooling1D(pool_size=3)``.
    num_classes : int, default 3
        Number of units in the softmax output layer.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer and the
        ``acc`` metric. ``model.summary()`` is printed as a side effect.
    """
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only the selected branch is constructed, so no unused layers are created.
    if extra_conv:
        # Multi-branch variant: one convolution per kernel size, pooled and then
        # concatenated along the sequence axis.
        pooled_branches = []
        for filter_size in (3, 4, 5):
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(pool_size=3)(l_conv)
            pooled_branches.append(l_pool)
        features = Concatenate(axis=1)(pooled_branches)
    else:
        # Single-convolution variant.
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    # Shared classification head.
    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(num_classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [20]:
# Integer sentiment ids of the training and test splits as NumPy arrays.
y_tr, y_ts = (
    frame['sentiment'].values
    for frame in (clean_train_comments, clean_test_comments)
)

In [21]:
# Conventional (features, labels) names for the two splits.
x_train, y_train = train_cnn_data, y_tr
x_test, y_test = test_cnn_data, y_ts

In [22]:
# Build the classifier on the frozen pre-trained embeddings. Arguments are passed by
# keyword so it is clear what each value means.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index)+1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=1,
    trainable=False,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 700)     11288200    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 98, 128)      268928      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 97, 128)      358528      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [18]:
#define callbacks
# early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.1, patience=4, verbose=1)
# callbacks_list = [early_stopping]

In [23]:
# One-hot encode with an explicit class count. Without `num_classes`, to_categorical
# sizes the output from the largest id present, so a split lacking the highest class
# would yield fewer columns than the model's 3-unit softmax output.
NUM_CLASSES = 3
y_train_onehot = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test_onehot = to_categorical(y_test, num_classes=NUM_CLASSES)

# NOTE(review): the test split is also used as validation data here, so the per-epoch
# validation metrics and the final classification report are computed on the same
# samples — use a separate validation split if these metrics drive model selection.
hist = model.fit(
    x_train,
    y_train_onehot,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(x_test, y_test_onehot),
)

Train on 8939 samples, validate on 500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
# model.save('./model/yoon_kim_3/cnn_model_04.h5')  

In [25]:
# model = load_model('./model/yoon_kim_3/cnn_model_04.h5')

# Predict class probabilities for the test set and keep the most likely class per row.
y_predict = model.predict(test_cnn_data, batch_size=256, verbose=1).argmax(axis=1)
print(classification_report(y_test, y_predict, labels=[0, 1, 2], digits=8))

             precision    recall  f1-score   support

          0  0.74308300 0.92156863 0.82275711       204
          1  0.66197183 0.53409091 0.59119497        88
          2  0.85227273 0.72115385 0.78125000       208

avg / total  0.77423036 0.77000000 0.76473522       500

