In [None]:
import os
import sys
import gensim
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from gensim.models.doc2vec import LabeledSentence
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
import numpy as np
from keras import optimizers
from keras.models import load_model
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [None]:
# Load the training CSV into a DataFrame.
clean_train_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/train.csv")
# Alternative training file, kept for reference:
# clean_train_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/data_train_full.csv")
# Preview the first rows to sanity-check the load.
clean_train_comments.head()

In [None]:
# Tokenizer that emits every run of word characters (regex \w+); it is reused
# for the test frame in a later cell.
tokenizer = RegexpTokenizer(r'\w+')
# Force the text column to str so the tokenizer always receives strings
# (note: missing values become the literal string 'nan').
clean_train_comments['content'] = clean_train_comments['content'].astype('str') 
# Token list per row.
# NOTE(review): no later cell visible here reads this column; labelize_text
# splits `content` on whitespace instead.
clean_train_comments['tokens'] = clean_train_comments['content'].apply(tokenizer.tokenize)
# Integer class id per row, derived from the distinct `polarity` values found
# in this frame.
clean_train_comments['sentiment'] = clean_train_comments['polarity'].astype('category').cat.codes
   
clean_train_comments.head()

In [None]:
# Load the test CSV into a DataFrame; later cells use it as the validation set.
clean_test_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/test.csv")
# Alternative test file, kept for reference:
# clean_test_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/data_testing_full.csv")
# Preview the first rows to sanity-check the load.
clean_test_comments.head()

In [None]:
# Force the text column to str so the tokenizer always receives strings
# (note: missing values become the literal string 'nan').
clean_test_comments['content'] = clean_test_comments['content'].astype('str')
clean_test_comments["tokens"] = clean_test_comments["content"].apply(tokenizer.tokenize)

# Encode polarity with the *training* label vocabulary. Deriving the codes from
# the test frame alone would number the classes by whatever labels happen to
# occur in the test split, so a missing or extra class would silently shift the
# ids relative to the ones the model is trained on.
train_polarity_labels = clean_train_comments['polarity'].astype('category').cat.categories
unseen_labels = set(clean_test_comments['polarity'].dropna().unique()) - set(train_polarity_labels)
if unseen_labels:
    raise ValueError(
        "test polarity labels missing from the training data: %s"
        % sorted(map(str, unseen_labels))
    )
# Missing polarity values still map to -1, as they did with `.cat.codes`.
clean_test_comments['sentiment'] = pd.Categorical(
    clean_test_comments['polarity'], categories=train_polarity_labels
).codes

clean_test_comments.head()

In [None]:
# Training split: raw text column and its integer sentiment codes.
x_train, y_train = clean_train_comments['content'], clean_train_comments['sentiment']
# Validation split: taken from the test frame.
x_validation, y_validation = clean_test_comments['content'], clean_test_comments['sentiment']

In [None]:
def labelize_text(text, label):
    """Wrap each document in a ``LabeledSentence`` with a unique tag.

    Args:
        text: Sequence of strings exposing an ``index`` attribute whose
            entries pair up positionally with the documents.
        label: Prefix for the generated tags.

    Returns:
        A list with one ``LabeledSentence`` per document. Its words are the
        whitespace-split document and its single tag is ``"<label>_<index>"``.
    """
    return [
        LabeledSentence(document.split(), [label + '_%s' % row_id])
        for row_id, document in zip(text.index, text)
    ]
  
# Build the tagged documents straight from the source DataFrame columns rather
# than from `x_train` / `x_validation` themselves. The cell then stays
# re-runnable: feeding its own output (a plain list) back into labelize_text
# fails because a list has no pandas-style `.index`.
x_train = labelize_text(clean_train_comments['content'], 'TRAIN')
x_validation = labelize_text(clean_test_comments['content'], 'TEST')

In [None]:
# Fixed number of rows (timesteps) every document is padded to before it is
# fed to the network.
MAX_SEQUENCE_LENGTH = 85
# Feature width per timestep used when reshaping the inputs and declaring the
# LSTM input shape.
# NOTE(review): presumably must equal the loaded word2vec vector size — confirm.
data_dim = 500

In [None]:
# Load a saved gensim Word2Vec model from disk; build_Word_Vector looks tokens
# up in it.
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
# Alternative model file, kept for reference:
# word2vec = Word2Vec.load("./vectorizer/tripadvisor/word2vec_300.model")

In [None]:
def build_Word_Vector(tokens, size):
    """Embed a token sequence as a fixed-length, left-padded matrix.

    Every token is looked up in the module-level ``word2vec`` model. The
    result always has ``MAX_SEQUENCE_LENGTH`` rows:

    * shorter sequences are padded with all-zero rows at the front;
    * longer sequences keep only their first ``MAX_SEQUENCE_LENGTH`` tokens;
    * tokens missing from the vocabulary become an all-zero row.

    Args:
        tokens: Iterable of word strings for one document.
        size: Dimensionality of a word vector (number of columns).

    Returns:
        ``numpy.ndarray`` of shape ``(MAX_SEQUENCE_LENGTH, size)``.
    """
    # Newer gensim releases expose the vectors on `.wv`; fall back to indexing
    # the model object itself where that attribute does not exist.
    vectors = getattr(word2vec, 'wv', word2vec)

    # Truncate so the padding length can never become negative.
    tokens = list(tokens)[:MAX_SEQUENCE_LENGTH]

    # Preallocate the whole matrix: np.append would flatten the data and copy
    # the entire buffer once per token.
    vec = np.zeros((MAX_SEQUENCE_LENGTH, size))
    first_row = MAX_SEQUENCE_LENGTH - len(tokens)
    for row, word in enumerate(tokens, start=first_row):
        try:
            vec[row] = vectors[word]
        except KeyError:
            # Out-of-vocabulary token: leave its row as zeros.
            continue
    return vec

In [None]:
# Embed every tagged document and stack the results along a new leading
# (sample) axis. The vector width comes from `data_dim` rather than a repeated
# literal so it cannot drift from the value used for reshaping and for the
# model input. Iterating the lists directly lets tqdm show a total.
train_vecs = np.stack([build_Word_Vector(doc.words, data_dim) for doc in tqdm(x_train, desc='train')])
val_vecs = np.stack([build_Word_Vector(doc.words, data_dim) for doc in tqdm(x_validation, desc='validation')])

In [None]:
# Intended mini-batch size for training.
batch_size = 256
# Number of passes over the training data.
num_epochs = 10
# Units of the LSTM layer that is wrapped by Bidirectional.
hidden_size = 10
# Sequence length axis of the model input; tied to the padding length.
timesteps = MAX_SEQUENCE_LENGTH
# Width of the label arrays after reshaping (one integer class id per sample).
# This is NOT the number of output classes; the output layer size is set
# separately where the model is built.
num_class = 1

In [None]:
num_data = len(train_vecs)
num_data_val = len(val_vecs)

# Features: (samples, timesteps, data_dim), the layout the LSTM expects.
train_vecs = train_vecs.reshape((num_data, timesteps, data_dim))
val_vecs = val_vecs.reshape((num_data_val, timesteps, data_dim))

# Labels: (samples, num_class). The labels arrive as pandas Series, and
# Series.reshape was deprecated in pandas 0.19 and removed in later releases,
# so convert to a NumPy array first. np.asarray is a no-op on an ndarray,
# which also keeps this cell re-runnable.
y_train = np.asarray(y_train).reshape((num_data, num_class))
y_validation = np.asarray(y_validation).reshape((num_data_val, num_class))

In [None]:
# Number of sentiment classes predicted by the softmax output layer.
num_labels = 3

# Bidirectional LSTM encoder -> dropout -> softmax classifier.
model = Sequential()
model.add(Bidirectional(LSTM(hidden_size, input_shape=(timesteps, data_dim)), merge_mode='concat'))
model.add(Dropout(0.5))
model.add(Dense(num_labels, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# One-hot targets are pinned to `num_labels` columns: without `num_classes`,
# to_categorical sizes the matrix from the largest id present, so a split that
# lacks the highest class would no longer match the output layer.
# `batch_size` is passed explicitly; otherwise Keras falls back to its default
# of 32 and the configured value is ignored.
model.fit(
    train_vecs,
    to_categorical(y_train, num_classes=num_labels),
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(val_vecs, to_categorical(y_validation, num_classes=num_labels))
)

In [None]:
# Optional: uncomment to persist the trained model to disk for later reuse.
# model.save('./model/bi_lstm_3/bi_lstm_model_01.h5')  

In [None]:
# To evaluate a previously saved model instead of the one trained above:
# model = load_model('./model/bi_lstm_3/bi_lstm_model_01.h5')

# Class probabilities for the validation inputs, collapsed to the most likely
# class id per sample.
y_pred = np.argmax(model.predict(val_vecs), axis=1)

# Per-class precision / recall / F1 against the true validation labels.
print(classification_report(y_validation, y_pred, labels=[0, 1, 2], digits=8))