In [None]:
# GPU selection
import os
# Enumerate CUDA devices by PCI bus id and expose only device "0" to this
# process. Both variables are written before any deep-learning import below.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [None]:
# Initial hyper-parameters. MAX_SEQUENCE_LENGTH and VALIDATION_SPLIT are
# reassigned in a later configuration cell (to 20 and 0.3), so the values
# here only matter for cells executed before that one.

# Intended vocabulary cap for the tokenizer.
# NOTE(review): no visible cell reads this name; the tokenizer cell passes
# its own literal instead.
MAX_NB_WORDS = 100000
# Maximum length of each entry (sentence), including padding.
MAX_SEQUENCE_LENGTH = 40
# Fraction of the data held out for validation (not used in training).
VALIDATION_SPLIT = 0.2

In [None]:
import numpy as np
import re, sys, csv, pickle
from tqdm import tqdm_notebook

from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
from keras import backend as K

In [None]:
texts = []   # model input: one sentence per entry
labels = []  # model output: 1 = subjective, 0 = objective

# Subjective sentences, one per line; every line is labelled 1.
# Undecodable bytes are dropped (errors='ignore'). The context manager
# guarantees the file handle is closed as soon as the loop ends.
with open("../datasets/quote.tok.gt9.5000", "r", errors='ignore') as quote_file:
    for line in tqdm_notebook(quote_file, total=5000):
        # strip() removes the trailing newline and any surrounding whitespace.
        texts.append(line.strip())
        labels.append(1)  # subjective

In [None]:
# Objective sentences, one per line; every line is labelled 0 and appended
# after the entries already in `texts` / `labels`.
# The context manager guarantees the file handle is closed after the loop.
# NOTE(review): unlike the subjective file, this one is opened with the
# default (strict) error handling, so an undecodable byte raises
# UnicodeDecodeError -- confirm whether errors='ignore' is wanted here too.
with open("../datasets/plot.tok.gt9.5000", "r") as plot_file:
    for line in tqdm_notebook(plot_file, total=5000):
        # strip() removes the trailing newline and any surrounding whitespace.
        texts.append(line.strip())
        labels.append(0)  # objective

In [None]:
import  matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

plt.style.use('ggplot')

# Number of single-space-separated tokens in every example.
lengths = np.array([len(sentence.split(" ")) for sentence in texts])

# Statistics that are both printed in the axis label and drawn as guides.
mean_length = np.mean(lengths)
pct90_length = np.percentile(lengths, 90)

summary = "mean: {} words , min/max: {}/{} (90%: {})".format(
    str(int(mean_length)),
    str(np.min(lengths)),
    str(np.max(lengths)),
    str(round(pct90_length, 2)),
)

plt.figure(1, figsize=(10,6))
plt.hist(lengths, bins='auto')
plt.title("Distribution of text lengths")
plt.xlabel("Text Length: " + summary)
plt.ylabel("Examples")
# Solid line marks the mean, dashed line the 90th percentile.
plt.axvline(mean_length, ls="-", color="k")
plt.axvline(pct90_length, ls="--", color="k")
plt.show()

In [None]:
# Show one raw example of each class together with its integer label.
# Index 0 comes from the subjective file; index 9000 is expected to fall in
# the objective block appended afterwards (assumes 5000 lines per file).
for caption, position in (("Sample subjective:", 0), ("Sample objective:", 9000)):
    print(caption, texts[position], labels[position])

In [None]:
# Effective settings for everything below; the first two replace the values
# assigned in the earlier configuration cell.

# Sequences are padded / truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 20
# Fraction of examples held out for validation.
VALIDATION_SPLIT = 0.3
# Width of the word vectors; also selects which GloVe file is read.
EMBEDDING_DIM = 100
# Despite the name this is the path of a single GloVe text file.
GLOVE_DIR = "../glove/glove.6B.{}d.txt".format(EMBEDDING_DIM)

In [None]:
# Build the word -> integer-id vocabulary from the full corpus.
# `num_words` is an integer count; the cap is written as an int literal
# (2,000,000) rather than the float 2e6 so that Tokenizer code paths that use
# it as an array dimension keep working.
tokenizer = Tokenizer(num_words=2000000)
tokenizer.fit_on_texts(texts)

# Persist the fitted tokenizer so inference code can reuse the exact same
# word ids that the model is trained on.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("[i] Saved word tokenizer to file: tokenizer.pickle")

In [None]:
# Read the tokenizer back from disk (checks that the saved file is usable).
# Unpickling executes code from the file, so only load pickles that this
# notebook produced itself.
with open('tokenizer.pickle', 'rb') as pickle_file:
    tokenizer = pickle.load(pickle_file)

# Mapping word -> integer id learned by fit_on_texts.
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print('[i] Found %s unique tokens.' % vocabulary_size)

In [None]:
# Replace every sentence by the list of its word ids.
sequences = tokenizer.texts_to_sequences(texts)

# Bring all sequences to MAX_SEQUENCE_LENGTH; shorter ones get zeros appended
# (padding='post'), the truncation side is left at the Keras default.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# One-hot encode the integer labels. `labels` is overwritten here, so this
# cell must not be executed twice without rebuilding the raw label list.
label_ids = np.asarray(labels)
labels = to_categorical(label_ids)

print('[+] Shape of data tensor:', data.shape)
print('[+] Shape of label tensor:', labels.shape)

In [None]:
# Shuffle examples and labels with the same permutation, then hold out the
# last VALIDATION_SPLIT fraction for validation.
# A dedicated, seeded generator makes the partition identical on every
# Restart & Run All and leaves NumPy's global random state untouched.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice on an explicit boundary: with a validation count of 0 the negative
# slices `[:-0]` / `[-0:]` would yield an empty training set and put every
# example into validation.
nb_training_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_training_samples]
y_train = labels[:nb_training_samples]
x_val = data[nb_training_samples:]
y_val = labels[nb_training_samples:]

# Column sums of the one-hot labels give the number of examples per class.
print('[i] Number of entries in each category:')
print("[+] Training:\n",y_train.sum(axis=0))
print("[+] Validation:\n",y_val.sum(axis=0))

In [None]:
# `labels` was reordered by the permutation `indices` in the split cell while
# `texts` kept its original order, so row 0 of `labels` belongs to
# texts[indices[0]], not texts[0]. Look the sentence up through `indices` so
# the printed pair describes the same example.
# (Valid as long as the split cell has been executed exactly once.)
print("Sentence input" , texts[indices[0]])
print(" ")
print("One-hot label", labels[0])

In [None]:
# Parse the GloVe text file into {word: vector}. Each line holds a token
# followed by its space-separated float components.
embeddings_index = {}
print("[i] Loading GloVe from:",GLOVE_DIR,"...",end="")
# Explicit UTF-8 keeps the read independent of the platform's default
# encoding; the context manager closes the file even if parsing fails.
with open(GLOVE_DIR, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
# Terminate the progress line started above.
print(" done (%d vectors)." % len(embeddings_index))

# One row per word id, plus row 0 which the tokenizer never assigns to a word
# (it is the value used for padding). Rows start out as uniform random values
# in [0, 1) and are overwritten where a pretrained vector exists.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe (and row 0) keep their random initialisation.
        embedding_matrix[i] = embedding_vector

In [None]:
# Model input: one padded sequence of word ids per example.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding table initialised from `embedding_matrix` and fine-tuned during
# training (trainable=True). The +1 accounts for id 0, which is not in
# word_index.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)
embedded_sequences = embedding_layer(sequence_input)

In [None]:
def _conv_branch(inputs, kernel_size):
    """Return `inputs` passed through a 128-filter ReLU Conv1D of the given
    kernel size followed by 50% dropout."""
    features = Conv1D(filters=128, kernel_size=kernel_size, activation='relu')(inputs)
    return Dropout(0.5)(features)


# Three parallel branches that look at 3-, 5- and 7-token windows.
l_conv_3 = _conv_branch(embedded_sequences, 3)
l_conv_5 = _conv_branch(embedded_sequences, 5)
l_conv_7 = _conv_branch(embedded_sequences, 7)

# Join the branch outputs along axis 1 (the sequence axis).
l_conv = Concatenate(axis=1)([l_conv_3, l_conv_5, l_conv_7])

In [None]:
# Classification head: pool -> dropout -> flatten -> small dense layer ->
# two-way softmax (one unit per one-hot label column).
l_pool = MaxPooling1D(pool_size=2)(l_conv)
l_drop = Dropout(rate=0.5)(l_pool)
l_flat = Flatten()(l_drop)
l_dense = Dense(units=32, activation='relu')(l_flat)
preds = Dense(units=2, activation='softmax')(l_dense)

In [None]:
# Adam with an explicit learning-rate decay of 0.02; the remaining arguments
# are spelled out for visibility.
adam = optimizers.Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.02,
    amsgrad=False,
)

# Wire input and output tensors into a trainable model and report accuracy
# alongside the loss.
model = Model(inputs=sequence_input, outputs=preds)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
print("Training Progress:")
# Train for 60 epochs in mini-batches of 64, evaluating on the held-out
# validation split after every epoch. The returned History object feeds the
# learning-curve plots.
fit_options = dict(
    validation_data=(x_val, y_val),
    epochs=60,
    batch_size=64,
)
model_log = model.fit(x_train, y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

def _plot_history_curves(train_key, val_key, title, ylabel):
    """Plot the per-epoch training and validation series stored under the two
    given keys of `model_log.history` and show the figure."""
    plt.plot(model_log.history[train_key])
    plt.plot(model_log.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


# Learning curves: accuracy first, then loss.
_plot_history_curves('acc', 'val_acc', 'Accuracy (Higher Better)', 'Accuracy')
_plot_history_curves('loss', 'val_loss', 'Loss (Lower Better)', 'Loss')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools

# Display names indexed by class id. The loaders label objective sentences 0
# and subjective sentences 1, and the report / confusion matrix consume this
# list in class-id order, so "objective" must come first.
classes = ["objective", "subjective"]

In [None]:
# Class probabilities for the validation examples.
y_pred = model.predict(x_val)

# Collapse one-hot targets and probability rows to class ids.
Y_test = y_val.argmax(axis=1)
y_pred_class = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(Y_test, y_pred_class, target_names=classes))

In [None]:
# Switch the Matplotlib style used for the confusion-matrix figure below.
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-dark') -- confirm the name against the installed version.
plt.style.use('seaborn-dark')
def plot_confusion_matrix(cm, labels,
                          normalize=True,
                          title='Confusion Matrix (Validation Set)',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts; rows are drawn on the "True label" axis and
            columns on the "Predicted label" axis.
        labels: tick labels, in the same order as the rows / columns of `cm`.
        normalize: when True, every row is divided by its sum so each cell
            shows a fraction of that row. Rows that sum to zero are shown as
            0.00 instead of NaN.
        title: figure title.
        cmap: Matplotlib colormap used for the cells.

    Returns:
        None. The function only draws; it does not call `plt.show()`.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row has entries; empty rows keep the zeros of
        # `out`, avoiding a divide-by-zero warning and NaN cells.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)

    # Fractions get two decimals, raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

# Count validation predictions per (true class, predicted class) pair and
# render them on a fresh 14x7 figure.
cnf_matrix = confusion_matrix(y_true=Y_test, y_pred=y_pred_class)
plt.figure(figsize=(14,7))
plot_confusion_matrix(cnf_matrix, labels=classes)

In [None]:
# Write the trained weights to an HDF5 file in the working directory.
# Only weights are saved by this call; the fitted tokenizer is stored
# separately in tokenizer.pickle.
# NOTE(review): code that loads this file presumably has to rebuild the same
# layer stack first -- confirm against the consumer.
model.save_weights("subjectivity.h5")