In [1]:
import os
import pickle

# Set before Keras pulls in TensorFlow so the log level reliably takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # suppress tensorflow warnings

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

from keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils.dataset import get_sentences, get_corpora, get_top_author_gut_idx

Using TensorFlow backend.


# Acquire the dataset
The first time, this process takes around 10 minutes with a fast internet connection.
Afterwards the texts are read from a local MySQL cache and the process is much faster (~1 min).

In [2]:
USE_PICKLED_CORPORA = True
CORPORA_PICKLE = 'corpora.pkl'

if USE_PICKLED_CORPORA and os.path.exists(CORPORA_PICKLE):
    # NOTE: pickle.load can execute arbitrary code. Only load a cache file
    # that was written by this notebook, never one from an untrusted source.
    with open(CORPORA_PICKLE, 'rb') as f:
        corpora = pickle.load(f)
else:
    if USE_PICKLED_CORPORA:
        # The cache is only written at the end of the notebook, so it does not
        # exist on a first run; fall back to fetching instead of crashing.
        print(CORPORA_PICKLE + ' not found, fetching the corpora instead')
    # seed ensures we obtain author corpus from the same books each time.
    # If set to None, we will sample at random
    corpora = get_corpora(max_chars_per_author = 1e7, random_seed=42)

# Tokenize the input

In [3]:
VOCAB = 30000  # vocabulary size limit handed to the tokenizer (num_words)

# Word-level tokenizer: lower-cases the text, strips the listed punctuation
# characters and splits on single spaces.
tokenizer_options = dict(
    num_words=VOCAB,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)
t = keras.preprocessing.text.Tokenizer(**tokenizer_options)

In [4]:
# Build the tokenizer's vocabulary from every author's text.
author_texts = [text for text in corpora.values()]
t.fit_on_texts(author_texts)

# Convert corpora into sequences

In [5]:
corpora_seq = {}
SEQ_LEN = 100
for author, text in corpora.items():
    # texts_to_sequences takes a list of texts and returns one list of token
    # ids per text; we pass a single text, so take the first (only) result.
    # Indexing instead of squeeze() keeps a 1-D array even for a one-token corpus.
    corpus_seq = np.asarray(t.texts_to_sequences([text])[0], dtype=int)

    # drop the trailing corp_len % SEQ_LEN tokens so the corpus splits evenly
    corp_len = corpus_seq.shape[0]
    n_sequences = corp_len // SEQ_LEN
    aligned_len = n_sequences * SEQ_LEN

    # Explicit row count (rather than -1) also works for an empty corpus,
    # and SEQ_LEN replaces the previously hard-coded 100.
    sequences = corpus_seq[:aligned_len].reshape(n_sequences, SEQ_LEN)

    corpora_seq[author] = sequences

# Make vectors X, y
X - array of shape (n_samples, seq_len)

y - array of labels of shape (n_samples, n_classes)

In [6]:
# Stack every author's sequences into one sample matrix and build a parallel
# label vector that repeats the author name once per sequence.
authors = list(corpora_seq)
X = np.vstack([corpora_seq[author] for author in authors])
y = np.concatenate([[author] * len(corpora_seq[author]) for author in authors])

In [7]:
# Map author names to integer class ids, then expand them with to_categorical.
# NOTE: this overwrites y, so re-run the previous cell before re-running this one.
le = sklearn.preprocessing.LabelEncoder().fit(y)
class_ids = le.transform(y)
y = keras.utils.to_categorical(class_ids)

In [8]:
# Hold out 10% of the samples for testing. stratify keeps the class
# proportions the same in both splits; the fixed random_state makes the
# split (and therefore the reported test accuracy) reproducible.
X, X_test, y, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42)

# 1D CNN Neural Net Classifier

In [9]:
def make_model(VOCAB, BATCH_SIZE, SEQ_LEN, N_CLASSSES, LR, EMBEDDING_DIM=None):
    """Build and compile the 1D-CNN sequence classifier.

    Architecture: Embedding -> Conv1D(250 filters, kernel 3, ReLU) ->
    GlobalMaxPooling1D -> Dense(250, ReLU) -> Dense(N_CLASSSES, sigmoid).

    Args:
        VOCAB: input dimension of the embedding layer.
        BATCH_SIZE: used as the embedding width when EMBEDDING_DIM is None.
            Kept for backward compatibility; it does not configure the
            training batch size (that is passed to ``model.fit``).
        SEQ_LEN: length of each input sequence.
        N_CLASSSES: number of units in the output layer.
        LR: learning rate passed to the Adam optimizer.
        EMBEDDING_DIM: width of the embedding vectors. Defaults to
            BATCH_SIZE so existing callers get the same model as before.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # Decouple the embedding width from the batch size while preserving the
    # historical default for callers that do not pass EMBEDDING_DIM.
    if EMBEDDING_DIM is None:
        EMBEDDING_DIM = BATCH_SIZE

    model = keras.Sequential()
    model.add(layers.Embedding(VOCAB, EMBEDDING_DIM, input_length=SEQ_LEN))

    model.add(layers.Conv1D(filters=250,
                            kernel_size=3,
                            padding='valid',
                            activation='relu',
                            strides=1))

    # collapse the sequence axis, keeping the maximum activation per filter
    model.add(layers.GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(layers.Dense(250, activation='relu'))
    # model.add(layers.Dropout(0.2))

    # NOTE(review): sigmoid + binary_crossentropy treats every class as an
    # independent yes/no output. For one-hot, mutually exclusive labels the
    # usual choice is softmax + categorical_crossentropy. Left unchanged so
    # the tuned learning rate and the saved model stay comparable.
    model.add(layers.Dense(N_CLASSSES, activation='sigmoid'))

    # Adam is the canonical class name; the lowercase alias is legacy.
    optim = keras.optimizers.Adam(lr=LR)
    # NOTE(review): with binary_crossentropy, 'accuracy' is presumably resolved
    # to per-output binary accuracy; categorical_accuracy is the argmax-based
    # figure to read. Confirm against the installed Keras version.
    model.compile(loss='binary_crossentropy',
                  optimizer=optim,
                  metrics=['accuracy', keras.metrics.categorical_accuracy])

    return model

# Training on the entire train dataset
(no k-fold cross-validation at this point)

In [16]:
# The output layer must be as wide as the one-hot label matrix, so take the
# class count from its shape rather than from the classes that happen to
# occur in the training split.
N_CLASSSES = y.shape[1]
BATCH_SIZE = 64
EPOCHS = 20
LR = 1.62e-3 # chosen in k-fold cross-validation

model = make_model(VOCAB, BATCH_SIZE, SEQ_LEN, N_CLASSSES, LR)

# Step schedule: full LR while the epoch index is <= 3, LR/10 afterwards.
lr_sched = keras.callbacks.LearningRateScheduler(lambda epoch, lr: LR if epoch <= 3 else LR/10)
model.fit(X, y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=None,
          callbacks=[lr_sched])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f47cfd07150>

# Testing

In [17]:
scores = model.predict(X_test, batch_size=BATCH_SIZE)
pred = np.argmax(scores, axis=1)
truth = np.argmax(y_test, axis=1)

# predict yields one row of scores per test sample, so no trimming is needed;
# assert it instead of silently slicing.
assert len(pred) == len(truth), "expected one prediction per test sample"

# fraction of test sequences whose top-scoring class matches the true class
(pred == truth).sum()/len(truth)

0.9200853125493706

# 🥳

# Save necessary components

In [12]:
# Persist the trained model to disk.
model.save('model.h5')

In [13]:
# Persist the fitted label encoder (pickle is already imported in the first cell).
# Presumably needed later to map class indices back to author names.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

In [14]:
# Cache the raw corpora so a later run can reload them from corpora.pkl.
with open('corpora.pkl', mode='wb') as corpora_file:
    pickle.dump(corpora, corpora_file)

In [15]:
# Write the fitted tokenizer's JSON representation to tokenizer.json.
with open('tokenizer.json', mode='w') as tokenizer_file:
    tokenizer_file.write(t.to_json())

# Inference examples in the next notebook!