In [None]:
import pandas as pd
from gensim.models.doc2vec import LabeledSentence
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
import numpy as np
from keras import optimizers
from keras.models import load_model
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [None]:
# Load the full cleaned training corpus (the smaller split is kept below for reference).
# clean_train_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/train.csv")
clean_train_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/data_train_full.csv")
# Missing comments are read as NaN; a bare astype('str') would turn them into the
# literal word "nan", which would then be tokenized and embedded like real text.
# Replace them with an empty string first so they yield an empty token list.
clean_train_comments['content'] = clean_train_comments['content'].fillna('').astype('str')
clean_train_comments['tokens'] = clean_train_comments['content'].str.split()
# Integer class ids derived from the categorical encoding of the `polarity` column.
clean_train_comments['sentiment'] = clean_train_comments['polarity'].astype('category').cat.codes

clean_train_comments.head()

In [None]:
# Load the full cleaned test corpus (the smaller split is kept below for reference).
# clean_test_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/test.csv")
clean_test_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/data_testing_full.csv")
# Missing comments are read as NaN; a bare astype('str') would turn them into the
# literal word "nan". Replace them with an empty string before tokenizing.
clean_test_comments['content'] = clean_test_comments['content'].fillna('').astype('str')
clean_test_comments["tokens"] = clean_test_comments["content"].str.split()
# Encode the test labels against the TRAINING categories. Building the categories
# from the test split alone would number the classes differently whenever the test
# split lacks (or adds) a polarity value, silently misaligning labels between splits.
# A polarity value never seen in training gets the code -1.
train_polarity_labels = clean_train_comments['polarity'].astype('category').cat.categories
clean_test_comments['sentiment'] = pd.Categorical(
    clean_test_comments['polarity'], categories=train_polarity_labels
).codes

clean_test_comments.head()

In [None]:
# Corpus statistics for the training split: per-comment token counts, the flat
# list of every token occurrence, and the sorted list of distinct tokens.
training_sentence_lengths = list(map(len, clean_train_comments["tokens"]))
all_training_words = [
    token
    for comment_tokens in clean_train_comments["tokens"]
    for token in comment_tokens
]
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

In [None]:
# Same corpus statistics as for the training split, computed on the test split.
test_sentence_lengths = list(map(len, clean_test_comments["tokens"]))
all_test_words = [
    token
    for comment_tokens in clean_test_comments["tokens"]
    for token in comment_tokens
]
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

In [None]:
# Load the pretrained word2vec model whose vectors seed the embedding matrix below.
# NOTE(review): a '.vec' extension often denotes the plain word2vec text format, whereas
# Word2Vec.load expects a model written by gensim's own save() — confirm the file format.
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
# Alternative model location:
# word2vec = Word2Vec.load('./vectorizer/prosa/word2vec.model')

In [None]:
# --- Text / embedding dimensions ---
MAX_SEQUENCE_LENGTH = 95   # `maxlen` handed to pad_sequences for both splits
MAX_VOCAB_SIZE = 17872     # `num_words` cap handed to the Keras Tokenizer
EMBEDDING_DIM = 500        # width of each row of the embedding weight matrix

# --- Training parameters ---
num_epochs = 10
batch_size = 256
hidden_size = 100          # units of the LSTM wrapped by the Bidirectional layer
timesteps = MAX_SEQUENCE_LENGTH

In [None]:
# Fit the vocabulary on the training comments only, then turn each comment into a
# sequence of integer word ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
train_texts = clean_train_comments["content"].tolist()
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Look words up through the KeyedVectors object (`model.wv`). Indexing or membership
# tests on the Word2Vec model itself are deprecated in gensim 3.x and removed in 4.x.
# The getattr fallback also accepts an object that already is a KeyedVectors instance.
word_vectors = getattr(word2vec, 'wv', word2vec)

# Fail early with a readable message instead of a NumPy broadcasting error inside the loop.
pretrained_dim = getattr(word_vectors, 'vector_size', EMBEDDING_DIM)
if pretrained_dim != EMBEDDING_DIM:
    raise ValueError(
        "EMBEDDING_DIM (%d) does not match the pretrained vector size (%d)"
        % (EMBEDDING_DIM, pretrained_dim)
    )

# One row per word id plus one extra row for id 0, which is left as zeros
# (Keras word ids start at 1; 0 is the padding value used by pad_sequences).
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word_vectors:
        train_embedding_weights[index, :] = word_vectors[word]
    else:
        # Out-of-vocabulary words get a random vector drawn uniformly from [0, 1).
        train_embedding_weights[index, :] = np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

In [None]:
# Encode the test comments with the tokenizer fitted on the training split, then
# pad them to the same fixed length as the training matrix.
test_sequences = tokenizer.texts_to_sequences(
    clean_test_comments["content"].tolist()
)
test_data = pad_sequences(sequences=test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
def create_model(embeddings, max_sequence_length, num_words, embedding_dim, trainable=True,
                 lstm_units=None, dropout_rate=0.5, num_classes=3):
    """Build and compile a bidirectional-LSTM text classifier.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dropout -> Dense(softmax).
    The model summary is printed as a side effect.

    Args:
        embeddings: Initial embedding weight matrix of shape (num_words, embedding_dim).
        max_sequence_length: Length of every (padded) input sequence.
        num_words: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        trainable: Whether the embedding weights are updated during training.
        lstm_units: Units of the LSTM in each direction. Defaults to the
            notebook-level ``hidden_size`` when left as ``None``.
        dropout_rate: Dropout rate applied to the Bi-LSTM output.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if lstm_units is None:
        # Preserve the original behaviour of reading the notebook-level setting.
        lstm_units = hidden_size

    model = Sequential()
    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embeddings],
                        input_length=max_sequence_length,
                        trainable=trainable))

    # No `input_shape` here: the LSTM is not the first layer, so its input shape is
    # inferred from the Embedding output. The previous explicit value was built from
    # notebook globals rather than this function's own arguments.
    model.add(Bidirectional(LSTM(lstm_units), merge_mode='concat'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
# Target vectors: the integer `sentiment` codes of each split, taken out of the
# DataFrames as plain arrays.
y_tr = clean_train_comments['sentiment'].values
y_ts = clean_test_comments['sentiment'].values

In [None]:
# Final model inputs (padded id sequences) and targets (integer class codes) per split.
x_train, y_train = train_data, y_tr
x_test, y_test = test_data, y_ts

In [None]:
# Instantiate the classifier; the embedding table has one row per known word plus
# one extra row for index 0.
model = create_model(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index)+1,
    embedding_dim=EMBEDDING_DIM,
)

In [None]:
# One-hot encode both label vectors to the width of the model's softmax output.
# Without an explicit `num_classes`, to_categorical infers the width from the largest
# label present, so a split that lacks the highest class id would produce targets
# with fewer columns than the model (or than the other split) and break training.
num_classes = model.output_shape[-1]
# NOTE(review): the test split doubles as validation data here, so the reported
# validation metrics are not an independent estimate if used for model selection.
model.fit(
    x_train,
    to_categorical(y_train, num_classes=num_classes),
    epochs=num_epochs,
    validation_data=(x_test, to_categorical(y_test, num_classes=num_classes)),
    batch_size=batch_size,
)

In [None]:
# model.save('./model/bi_lstm_3/bi_lstm_model_05.h5')  

In [None]:
# Optionally restore a previously saved model instead of the one trained above:
# model = load_model('./model/bi_lstm_3/bi_lstm_model_05.h5')

# Predict class probabilities for the test inputs, keep the most probable class
# per row, and report per-class precision/recall/F1 against the true labels.
y_pred = model.predict(test_data).argmax(axis=1)
print(classification_report(y_test, y_pred, labels = [0, 1, 2], digits=8))