In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from keras import backend as K
from keras import initializers
from keras.engine.topology import Layer
from keras.utils.np_utils import to_categorical
from keras.models import load_model
from keras.utils import CustomObjectScope
from tensorflow import matmul

%matplotlib inline

Using TensorFlow backend.


In [2]:
with open('data_final/aws_data/X_train.pkl', 'rb') as infile:
    X_train = pickle.load(infile)
    
with open('data_final/aws_data/X_val.pkl', 'rb') as infile:
    X_val = pickle.load(infile)

with open('data_final/aws_data/X_test.pkl', 'rb') as infile:
    X_test = pickle.load(infile)

with open('data_final/aws_data/y_train.pkl', 'rb') as infile:
    y_train = pickle.load(infile)
    
with open('data_final/aws_data/y_val.pkl', 'rb') as infile:
    y_val = pickle.load(infile)

with open('data_final/aws_data/y_test.pkl', 'rb') as infile:
    y_test = pickle.load(infile)

In [3]:
def recombine(array):
    '''
    Rejoins the lists of words in the articles pre-formatted for training into a single string.
    
    Returns: String containing all the words in an article that was pre-formatted.
    '''
    return [' '.join(' '.join(sent) for sent in array)][0]

In [4]:
data = X_val.apply(recombine)
data = data.append(X_train.apply(recombine))
labels = y_val.append(y_train)

In [5]:
sample_data = data.sample(20000)
sample_indices = sample_data.index
sample_labels = labels[sample_indices]

In [6]:
counter = CountVectorizer(decode_error='ignore', strip_accents='unicode', max_features=50000)
counter.fit(data)
sample_data_counts = counter.transform(sample_data)

In [7]:
Cs = list(np.logspace(-4, 1, num=11))
clf = LogisticRegressionCV(Cs=Cs, solver='sag', cv=5)
tfidf = TfidfTransformer()
tfidf.fit(data_counts)
lrcv = clf.fit(X=tfidf.transform(sample_data_counts), y=sample_labels)
lrcv.C_

array([ 3.16227766])

In [8]:
#  Since the best value determined for C was the greatest of the values tried, run again with larger values
np.random.seed(3)
Cs = list(np.linspace(1, 10, num=10))
clf = LogisticRegressionCV(Cs=Cs, solver='sag', cv=5)
lrcv = clf.fit(X=tfidf.fit_transform(sample_data_counts), y=sample_labels)
lrcv.C_

array([ 4.])

In [9]:
Cs = list(np.linspace(1, 5, num=5))
ratios = list(np.linspace(0, 1, num=5))
clf = LogisticRegressionCV(Cs=Cs, l1_ratios=ratios, penalty='elasticnet', solver='saga', cv=5, max_iter=300)
lrcv = clf.fit(X=tfidf.transform(sample_data_counts), y=sample_labels)
lrcv.C_, lrcv.l1_ratio_



(array([ 4.]), array([ 0.]))

For logistic regression using either the sag or sag solver, a C value of 4 and the L2 penalty produce the optimal results.

In [10]:
test_strings = X_test.apply(recombine)
X_test_counts = counter.transform(test_strings)

In [13]:
data_counts = counter.transform(data)
lr = LogisticRegression(C=4.0, solver='sag', random_state=77, max_iter=300)
lr.fit(X=tfidf.transform(data_counts), y=labels)
lr.score(X=tfidf.transform(X_test_counts), y=y_test)

0.86896874999999996

In [14]:
lr = LogisticRegression(C=4.0, solver='saga', random_state=77, max_iter=300)
lr.fit(X=tfidf.transform(data_counts), y=labels)
lr.score(X=tfidf.transform(X_test_counts), y=y_test)

0.86896093750000003

In [15]:
counter = CountVectorizer(decode_error='ignore', strip_accents='unicode', max_features=150000)
counter.fit(data)
data_counts = counter.fit_transform(data)
X_test_counts = counter.transform(test_strings)

In [17]:
lr = LogisticRegression(C=4.0, solver='sag', random_state=77, max_iter=300)
lr.fit(X=tfidf.fit_transform(data_counts), y=labels)
lr.score(X=tfidf.transform(X_test_counts), y=y_test)

0.87283593749999999

Logistic regression after tuning for the value of C and either the L1 (only possiblefor the saga solver) or L2 penalty and with a 150,000-word vocabulary did slightly better than the simple cross-validation-fit accuracy of about 0.845 obtained as the benchmark in the data exploration notebook using a vocabulary of less than 5,000 words. Nonetheless, this is an impressive result compared to the HAN model trained using a cyclic learning rate, which achieved a slightly lower validation accuracy. The HAN model trained for 16 hours compared with 5 minutes or so for the logistic regression classifier. The HAN models using the adam optimizer, however, did obtain significantly higher validation scores just over 0.90. But these trained models need to be run on the test set.

In [19]:
max_words = 30  # max num words processed for each sentence
max_sentences = 30  # max num sentences processed for each article 
max_vocab = 150000
attention_dim = 100
batch_size = 64
words_file = 'data_final/words.pkl'
saved_model = 'models/adam-150-200-100/model.4.hdf5'

In [20]:
with open(words_file, 'rb') as infile:
    words = pickle.load(infile)
word_index = {}
for ix, (word, _) in enumerate(words.most_common(max_vocab)):
    word_index[word] = ix + 1

In [21]:
def create_data_matrix(data, max_sentences=max_sentences, max_words=max_words, max_vocab=max_vocab,
                      word_index=word_index):
    data_matrix = np.zeros((len(data), max_sentences, max_words), dtype='int32')
    for i, article in enumerate(data):
        for j, sentence in enumerate(article):
            if j == max_sentences:
                break
            k = 0
            for word in sentence:
                if k == max_words:
                    break
                ix = word_index.get(word.lower())
                if ix is not None and ix < max_vocab:
                    data_matrix[i, j, k] = ix
                k = k + 1
    return data_matrix

In [22]:
X_test_keras = create_data_matrix(X_test)
y_test_keras = np.asarray(to_categorical(y_test))

In [23]:
class HierarchicalAttentionNetwork(Layer):
    ''''''
    def __init__(self, **kwargs):
        self.init_weights = initializers.get('glorot_normal')
        self.init_bias = initializers.get('zeros')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super().__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init_weights((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init_bias((self.attention_dim,)))
        self.u = K.variable(self.init_weights((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super().build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return None

    def call(self, x, mask=None):        
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))
        
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


In [24]:
with CustomObjectScope({'HierarchicalAttentionNetwork': HierarchicalAttentionNetwork}):
            model = load_model(saved_model)

In [26]:
#  Save logistic regression model artifacts and garbage collect to free up memory
with open('models/logistic/counter.pkl', 'wb') as outfile:
    pickle.dump(counter, outfile)
with open('models/logistic/tfidf.pkl', 'wb') as outfile:
    pickle.dump(tfidf, outfile)
with open('models/logistic/lr.pkl', 'wb') as outfile:
    pickle.dump(lr, outfile)

In [27]:
import gc
data = None
data_counts = None
sample_data = None
sample_data_counts = None
X_train = None
X_val = None
gc.collect()

4

In [29]:
model.evaluate(x=X_test_keras, y=y_test_keras, batch_size=batch_size)



[0.22986735509708522, 0.91568749999999999]

In [59]:
score = model.predict(X_test_keras[1:3])

In [61]:
score[:,1]

array([ 0.996813  ,  0.99338996], dtype=float32)

In [63]:
scores = lr.predict_proba(tfidf.transform(X_test_counts[1:3]))

In [65]:
(score[:,1] + scores[:,1])/2

array([ 0.98672044,  0.98276808])