### Setup

In [1]:
from functions import *

# important parameters

classes = ["Murakami", "Abe", "Kafka", "Soseki", "Yoshimoto"]
focus_class = "Kafka"

sentences_per_section = 20
words_per_section = 20*15

### Data Loading and Prep

In [2]:
all_texts_paths = load_texts()

texts, labels = [ ], [ ]

for text in all_texts_paths:
    path_to_text, num_chapters = text[0], text[1]
    book_name = path_to_text.split("/")[-1]
    author_name = book_name.split("_")[0]
    new_text_sections = export_text_sections(path_to_text, sentences_per_section)
    texts += new_text_sections
    for _ in range(len(new_text_sections)):
        if author_name==focus_class: labels.append(1)
        else: labels.append(0)
            
# plot_distribution(texts, labels)

Number of Texts: 12
Authors: {'Kafka', 'Abe', 'Murakami', 'Yoshimoto', 'Soseki'}


In [3]:
GLOVE_URL = "https://s3-ap-southeast-1.amazonaws.com/deeplearning-mat/glove.6B.100d.txt.zip"
GLOVE_DIR = keras.utils.get_file("glove.6B.100d.txt.zip", GLOVE_URL, cache_subdir="datasets", extract=True)
print("GloVe data present at", GLOVE_DIR)
GLOVE_DIR = GLOVE_DIR.replace(".zip", "")

GloVe data present at /home/jovyan/.keras/datasets/glove.6B.100d.txt.zip


In [4]:
tokenizer = Tokenizer(filters="", lower=True, num_words=1e7)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
print("[INFO] Vocabulary size:", len(word_index))

[INFO] Vocabulary size: 27941


In [5]:
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, padding="pre", maxlen=(words_per_section))

#labels = to_categorical(np.asarray(labels))
#labels = np.asarray(labels).reshape((len(labels), 1))
labels = np.asarray(labels)

print("[INFO] Shape of data tensor:", data.shape)
print("[INFO] Shape of label tensor:", labels.shape)

x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, stratify=labels)

print('[INFO] Number of entries in each category:')
print("[INFO] Training:\t", len(y_train))
print("[INFO] Validation:\t", len(y_val))

[INFO] Shape of data tensor: (3391, 300)
[INFO] Shape of label tensor: (3391,)
[INFO] Number of entries in each category:
[INFO] Training:	 2373
[INFO] Validation:	 1018


In [6]:
EMBEDDING_DIM = 100

embeddings_index = {}
f = open(GLOVE_DIR)
print("[i] (long) Loading GloVe from:",GLOVE_DIR,"...",end="")
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
f.close()
print("Done.\n[+] Proceeding with Embedding Matrix...", end="")
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print(" Completed!")

[i] (long) Loading GloVe from: /home/jovyan/.keras/datasets/glove.6B.100d.txt ...Done.
[+] Proceeding with Embedding Matrix... Completed!


### Model

In [7]:
sequence_input = Input(shape=(words_per_section,), dtype='int32') # input to the model

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=words_per_section,
                            trainable=True)

l_embed = embedding_layer(sequence_input)

l_act = LSTM(1, return_sequences=True, activation='relu',
             kernel_regularizer=regularizers.l2(0.001),
             activity_regularizer=regularizers.l1(0.001))(l_embed)

l_pool = GlobalAveragePooling1D(data_format='channels_first')(l_act)

preds = Dense(1, activation='sigmoid')(l_pool)

Instructions for updating:
Colocations handled automatically by placer.


In [8]:
model = Model(sequence_input, preds)
model.compile(loss="binary_crossentropy",
              optimizer="rmsprop",
              metrics=["acc"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 100)          2794200   
_________________________________________________________________
lstm (LSTM)                  (None, 300, 1)            408       
_________________________________________________________________
global_average_pooling1d (Gl (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 301       
Total params: 2,794,909
Trainable params: 2,794,909
Non-trainable params: 0
_________________________________________________________________


### Train Model

In [9]:
print("Training Progress:\n")

opt = keras.optimizers.RMSprop(lr=0.002, decay=0.01)

model.compile(loss="binary_crossentropy",
              optimizer=opt,
              metrics=["acc"])
model_log = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                      epochs=40, batch_size=128)

Training Progress:

Train on 2373 samples, validate on 1018 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
 512/2373 [=====>........................] - ETA: 4s - loss: 0.0812 - acc: 0.9922

KeyboardInterrupt: 

### Interpretation

In [10]:
layer_name = str(model.layers[3].name)
print("Truncated model ends at:", layer_name)
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)

Truncated model ends at: global_average_pooling1d


In [11]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

In [12]:
# "positive" examples
start_index = 10
for index in range(start_index, start_index + 20):
    print("Positive:", labels[index])
    output = test_and_export_html(intermediate_layer_model,
                                  model, reverse_word_map,
                                  data[index], labels[index])
    display(HTML(output))

Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


In [13]:
# "negative" examples
start_index = 2500
for index in range(start_index, start_index + 20):
    print("Positive:", labels[index])
    output = test_and_export_html(intermediate_layer_model,
                                  model, reverse_word_map,
                                  data[index], labels[index])
    display(HTML(output))

Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0


Positive: 0
