In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
from multiprocessing.pool import Pool
import re
# Files passed to get_labelset() to build the positive / negative label sets.
positive_file = 'data/rt-polarity.pos'
negative_file = 'data/rt-polarity.neg'
# Directory prefix for datasetSplit.txt and datasetSentences.txt, which are opened further down.
data_root = 'data/stanfordSentimentTreebank/'
# Path template for the GloVe vectors; get_glove() substitutes '<size>' with the vector size.
glove_pattern = 'data/glove.6B.<size>d.txt'
# NOTE(review): the functions visible in this section take the vector size as an
# argument instead of reading this global -- confirm where it is actually used.
glove_size = 300

In [2]:
# Module-level cache holding the most recently loaded GloVe table (word -> vector).
glove_dict = None
def get_glove(size = 50):
    """Return a dict mapping each GloVe token to its vector of length `size`.

    The table is read from the file obtained by substituting `size` into
    `glove_pattern` and cached in the module-level `glove_dict`; it is only
    re-read when the cache is empty or holds vectors of a different size.

    Args:
        size: Dimensionality of the GloVe vectors to load.

    Returns:
        dict: token -> 1-D numpy array of length `size`.
    """
    global glove_dict
    cached_size = None
    if glove_dict:
        # Peek at a single vector instead of materialising every value in a list.
        cached_size = len(next(iter(glove_dict.values())))
    if cached_size != size:
        file = re.sub('<size>', str(size), glove_pattern)
        # quoting=3 (QUOTE_NONE): quote characters are ordinary tokens in the file.
        # keep_default_na=False: tokens such as "null" or "nan" must stay strings
        # rather than being parsed into NaN index entries.
        glove = pd.read_csv(file, sep = " ", header = None, index_col = 0, quoting = 3,
                            keep_default_na = False)
        # Pair each index label with its row directly; no need to transpose the table.
        glove_dict = dict(zip(glove.index, glove.values))
    return glove_dict

def preprocess_sentence(line):
    """Normalise a raw line: trim it, delete non-ASCII characters, lower-case it.

    Args:
        line: Input text.

    Returns:
        str: The cleaned, lower-cased text without surrounding whitespace.
    """
    trimmed = line.strip()
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', trimmed)
    # Deleting characters can expose new leading/trailing whitespace, so trim again.
    return ascii_only.strip().lower()

def preprocess_for_labels(line):
    """Reduce a sentence to a key used for label matching.

    The text is first cleaned with preprocess_sentence(), then every run of
    non-word characters (whitespace and punctuation included) is removed, so
    two sentences that differ only in spacing or punctuation share a key.

    Args:
        line: Input text.

    Returns:
        str: The cleaned text with all non-word characters removed.
    """
    line = preprocess_sentence(line)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # No extra strip() is needed because whitespace is itself matched by \W.
    return re.sub(r'\W+', '', line)
    
def get_labelset(file):
    """Read a UTF-8 text file and return the set of label keys of its lines.

    Args:
        file: Path of the file to read, one sentence per line.

    Returns:
        set: preprocess_for_labels() applied to every line, de-duplicated.
    """
    with open(file, "rt", encoding="utf-8") as handle:
        return {preprocess_for_labels(raw_line) for raw_line in handle}

In [3]:
# Sets of normalised sentence keys, one per polarity file; check_sentiment()
# looks sentences up in these to decide their label.
positive_labelset = get_labelset(positive_file)
negative_labelset = get_labelset(negative_file)

In [4]:
# Build data_split_dict: sentence index (int) -> 'train' / 'test' / 'val',
# parsed from the comma-separated "index,set_code" rows of datasetSplit.txt.
data_split_dict = {}
data_split_map = {'1': 'train', '2': 'test', '3':'val'}
with open(data_root + 'datasetSplit.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line would otherwise crash the two-value unpack below.
            continue
        [index, set_code] = line.split(',')
        try:
            index = int(index)
        except ValueError:
            # Rows whose first field is not an integer (e.g. a header row) are skipped.
            continue
        if set_code not in data_split_map:
            # Report unknown split codes; they fall back to 'val' below.
            print(index, set_code)
        data_split_dict[index] = data_split_map.get(set_code, 'val')

In [5]:
def check_sentiment(sentence):
    """Look up the polarity label of a sentence in the two label sets.

    Args:
        sentence: Raw sentence text; it is normalised with
            preprocess_for_labels() before the lookup.

    Returns:
        int: 0 if the key is in `negative_labelset`, 1 if it is in
        `positive_labelset` (the negative set is checked first), -1 if it is in neither.
    """
    s = preprocess_for_labels(sentence)
    for y, labelset in enumerate([negative_labelset, positive_labelset]):
        # A single set membership test is enough; there is no need to walk the set.
        if s in labelset:
            return y
    return -1

# Build two module-level tables from datasetSentences.txt:
#   sentences: cleaned sentence text -> {'split_set': ..., 'y': 0/1}
#   word2ind:  token -> integer id, assigned in order of first appearance
# Sentences that match neither label set are counted in n_skipped and dropped.
sentences = {}
word2ind = {}
ttws = tf.keras.preprocessing.text.text_to_word_sequence
n_skipped = 0
# NOTE(review): ids start at 0, and get_dataset() pads with pad_sequences, whose
# default pad value is also 0 -- padding is then indistinguishable from the first
# token. Confirm whether ids should start at 1 (the embedding size would need +1).
idx = 0
with open(data_root + 'datasetSentences.txt') as data:
    lines = data.readlines()
    for line in lines:
        # Raw string: '\d' inside a plain literal is an invalid escape sequence.
        index = re.search(r'^\d+', line)
        if index is None:
            # Lines that do not start with a sentence index are ignored.
            continue
        index = int(index.group())
        if index not in data_split_dict:
            # Report sentences without a split assignment; they default to 'val'.
            print(index, line)
        entry = {'split_set': data_split_dict.get(index, 'val')}
        line = preprocess_sentence(line)
        # Drop the leading "<index><whitespace>" prefix so only the sentence text remains.
        line = re.sub(r'^\d+\s+', '', line)
        sentiment = check_sentiment(line)
        if sentiment < 0:
            n_skipped += 1
            continue
        entry['y'] = sentiment
        sentences[line] = entry
        for token in ttws(line):
            if token not in word2ind:
                word2ind[token] = idx
                idx = idx + 1

In [6]:
# Report how many sentences had no label match (skipped) and how many were kept.
print(n_skipped, ' skipped')
print(len(sentences), ' kept')

2775  skipped
9077  kept


In [7]:
# Group the labelled sentences by split: data_splits[split]['x'] holds the
# token-id sequences and data_splits[split]['y'] the matching 0/1 labels.
data_splits = {}
for title in ['train', 'test', 'val']:
    data_splits[title] = {'x': [], 'y': []}
for line, entry in sentences.items():
    split = data_splits[entry['split_set']]
    split['x'].append([int(word2ind[token]) for token in ttws(line)])
    split['y'].append(int(entry['y']))

In [8]:
# Print the number of examples in each split.
for title in ['train', 'test', 'val']:
    print(title + ' dataset size:', len(data_splits[title]['x']))

train dataset size: 6530
test dataset size: 1702
val dataset size: 845


In [9]:
def get_dataset(label, sequence_len):
    """Return the padded inputs and labels of one split of `data_splits`.

    Args:
        label: Split name used as key into `data_splits` ('train', 'test' or 'val').
        sequence_len: Length every sequence is brought to (`maxlen` of pad_sequences).

    Returns:
        tuple: (x, y) where x is the pad_sequences result (padding added at the
        end of each sequence) and y is a numpy array of the labels.
    """
    split = data_splits[label]
    sequences = [np.array(token_ids) for token_ids in split['x']]
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen = sequence_len,
        padding = 'post',
    )
    labels = np.array(split['y'])
    return padded, labels

In [50]:
def get_embedding_matrix(glove_size):
    """Build the embedding matrix for the vocabulary in `word2ind` from GloVe vectors.

    Row `word2ind[word]` holds the GloVe vector of `word`; words missing from
    GloVe get the mean of all GloVe vectors instead.

    Args:
        glove_size: Dimensionality of the GloVe vectors to load via get_glove().

    Returns:
        numpy.ndarray: Array of shape (len(word2ind), vector length).
    """
    glove = get_glove(glove_size)
    # Fallback vector for out-of-vocabulary words: the average GloVe vector.
    default_vector = np.mean(list(glove.values()), axis = 0)
    # np.empty is safe only because every row is written in the loop below
    # (assumes the ids in word2ind cover 0..len(word2ind)-1, as built above).
    embedding_matrix = np.empty((len(word2ind), default_vector.shape[0]))
    for word, position in word2ind.items():
        embedding_matrix[position,:] = glove.get(word, default_vector)
    return embedding_matrix


### 2.1 Define Network Models

In [None]:
class CNNModel(tf.keras.Model):
    """Convolutional sentence classifier.

    Pipeline: embedding -> one Conv1D per filter window (ReLU) -> global max
    pooling of each filter output -> concatenation -> dropout -> single
    sigmoid unit.

    Args:
        embedding: Embedding dimension; also the GloVe vector size requested
            when `pretrained_embedding` is true.
        train_embeddings: Whether the embedding weights are trainable.
        pretrained_embedding: If true, initialise the embedding from
            get_embedding_matrix(embedding); otherwise use a random normal
            initializer (mean 0, stddev 0.05).
        vocab_size: Number of rows in the embedding; defaults to len(word2ind).
        filter_windows: Kernel sizes, one Conv1D layer per entry.
        feature_maps: Number of filters in each Conv1D layer.
        dropout_rate: Rate of the dropout applied to the pooled features.
        l2_constraint: Max-norm bound applied to the Conv1D and Dense kernels.
    """

    def __init__(self, embedding = 300,
                 train_embeddings = True,
                 pretrained_embedding = False,
                 vocab_size = None,
                 filter_windows = [3,4,5],
                 feature_maps = 100,
                 dropout_rate = .5,
                 l2_constraint = 3):
        super().__init__()
        if pretrained_embedding:
            embedding_matrix = get_embedding_matrix(embedding)
            embedding_initializer = tf.keras.initializers.Constant(embedding_matrix)
        else:
            embedding_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None)
        if vocab_size is None:
            vocab_size = len(word2ind)
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding,
            embeddings_initializer= embedding_initializer,
            trainable = train_embeddings
        )
        # One convolution per window size; filter_windows is only read, never mutated.
        self.filters = [
            tf.keras.layers.Conv1D(
                feature_maps, fw, activation = 'relu',
                kernel_constraint=tf.keras.constraints.max_norm(l2_constraint))
            for fw in filter_windows
        ]
        self.max_pool = tf.keras.layers.GlobalMaxPool1D()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.concat = tf.keras.layers.Concatenate(axis=1)
        self.logistic = tf.keras.layers.Dense(1, activation = 'sigmoid',
                                              kernel_constraint=tf.keras.constraints.max_norm(l2_constraint))

    def call(self, inputs, training = False):
        """Run the forward pass and return the sigmoid output of the final Dense layer.

        Args:
            inputs: Batch of integer token-id sequences fed to the embedding.
            training: Forwarded to the dropout layer, which is only active in
                training mode.
        """
        x = self.embedding(inputs)
        pooled = [self.max_pool(conv(x)) for conv in self.filters]
        # Concatenate needs several inputs; with a single filter window use it as is.
        x = pooled[0] if len(pooled) == 1 else self.concat(pooled)
        # Let the layer interpret `training` instead of a Python `if`, which
        # breaks when the flag is a tensor rather than a bool.
        x = self.dropout(x, training = training)
        return self.logistic(x)
            
def compile_model(model = CNNModel, lr = .01, model_args = {}, metrics = ['BinaryAccuracy']):
    """Instantiate a model class and compile it for binary classification.

    Args:
        model: Model class (or factory) called as model(**model_args).
        lr: Learning rate of the Adam optimizer.
        model_args: Keyword arguments forwarded to the model constructor.
        metrics: Metrics handed to compile().

    Returns:
        The compiled model (binary cross-entropy loss, Adam optimizer).
    """
    network = model(**model_args)
    network.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate = lr),
        metrics = metrics,
    )
    return network

def run_model(model_type = CNNModel, verbose = 2, epochs = 100, batch_size = 50, 
              sequence_len = 45, learning_rate = .01, model_args = {}):
    """Train a freshly compiled model on the 'train' split, validating on 'val'.

    Training uses early stopping on `val_loss` (patience 10) and restores the
    best weights when it stops.

    Args:
        model_type: Model class passed to compile_model().
        verbose: Verbosity level forwarded to fit().
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        sequence_len: Length the input sequences are padded to.
        learning_rate: Learning rate passed to compile_model().
        model_args: Keyword arguments for the model constructor.

    Returns:
        The history object returned by fit().
    """
    train_x, train_y = get_dataset('train', sequence_len)
    val_x, val_y = get_dataset('val', sequence_len)
    network = compile_model(model_type, learning_rate, model_args)
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights = True,
    )
    fit_options = {
        'callbacks': [early_stopping],
        'validation_data': (val_x, val_y),
        'batch_size': batch_size,
        'epochs': epochs,
        'verbose': verbose,
    }
    return network.fit(train_x, train_y, **fit_options)

### 2.2 Run a basic CNN Model

In [None]:
# Baseline run with every default: CNNModel with randomly initialised, trainable embeddings.
rand_hist = run_model()

In [92]:
# "Static" variant: embeddings initialised from GloVe and frozen (not trainable).
static_hist = run_model(model_args = {'pretrained_embedding': True, 'train_embeddings': False})

Train on 6530 samples, validate on 845 samples
Epoch 1/100
6530/6530 - 7s - loss: 0.6413 - binary_accuracy: 0.6303 - val_loss: 0.5675 - val_binary_accuracy: 0.7290
Epoch 2/100
6530/6530 - 6s - loss: 0.5305 - binary_accuracy: 0.7686 - val_loss: 0.5194 - val_binary_accuracy: 0.7621
Epoch 3/100
6530/6530 - 6s - loss: 0.4699 - binary_accuracy: 0.8087 - val_loss: 0.4936 - val_binary_accuracy: 0.7669
Epoch 4/100
6530/6530 - 6s - loss: 0.4249 - binary_accuracy: 0.8372 - val_loss: 0.4786 - val_binary_accuracy: 0.7728
Epoch 5/100
6530/6530 - 6s - loss: 0.3897 - binary_accuracy: 0.8608 - val_loss: 0.4675 - val_binary_accuracy: 0.7822
Epoch 6/100
6530/6530 - 6s - loss: 0.3600 - binary_accuracy: 0.8798 - val_loss: 0.4647 - val_binary_accuracy: 0.7728
Epoch 7/100
6530/6530 - 6s - loss: 0.3330 - binary_accuracy: 0.8971 - val_loss: 0.4575 - val_binary_accuracy: 0.7811
Epoch 8/100
6530/6530 - 6s - loss: 0.3091 - binary_accuracy: 0.9133 - val_loss: 0.4529 - val_binary_accuracy: 0.7882
Epoch 9/100
6530/

In [93]:
# "Non-static" variant: embeddings initialised from GloVe and fine-tuned during training.
# Assigned to its own name only, so the frozen-embedding history in static_hist is
# not overwritten by this run.
nonstatic_hist = run_model(model_args = {'pretrained_embedding': True, 'train_embeddings': True})

Train on 6530 samples, validate on 845 samples
Epoch 1/100
6530/6530 - 15s - loss: 0.6394 - binary_accuracy: 0.6297 - val_loss: 0.5594 - val_binary_accuracy: 0.7515
Epoch 2/100
6530/6530 - 14s - loss: 0.5208 - binary_accuracy: 0.7821 - val_loss: 0.5128 - val_binary_accuracy: 0.7751
Epoch 3/100
6530/6530 - 14s - loss: 0.4508 - binary_accuracy: 0.8250 - val_loss: 0.4911 - val_binary_accuracy: 0.7692
Epoch 4/100
6530/6530 - 14s - loss: 0.3968 - binary_accuracy: 0.8591 - val_loss: 0.4733 - val_binary_accuracy: 0.7799
Epoch 5/100
6530/6530 - 14s - loss: 0.3518 - binary_accuracy: 0.8856 - val_loss: 0.4573 - val_binary_accuracy: 0.7882
Epoch 6/100
6530/6530 - 15s - loss: 0.3117 - binary_accuracy: 0.9107 - val_loss: 0.4484 - val_binary_accuracy: 0.7882
Epoch 7/100
6530/6530 - 14s - loss: 0.2769 - binary_accuracy: 0.9296 - val_loss: 0.4423 - val_binary_accuracy: 0.7941
Epoch 8/100
6530/6530 - 14s - loss: 0.2451 - binary_accuracy: 0.9436 - val_loss: 0.4388 - val_binary_accuracy: 0.7882
Epoch 9/1

KeyboardInterrupt: 

In [18]:
def plot_history(histories, title = None, sub_titles = None, width = 2, plot_metrics = None):
    """Plot the training curves of several runs, one subplot (row) per run.

    Args:
        histories: Non-empty sequence of objects exposing a `.history` dict of
            metric name -> list of per-epoch values.
        title: Optional figure-level title.
        sub_titles: Optional sequence of per-subplot titles, indexed like `histories`.
        width: Unused; kept so existing calls keep working.
        plot_metrics: Optional list of metric names; each one is plotted
            together with its 'val_'-prefixed counterpart. When omitted, every
            key of the first history is plotted.
    """
    if plot_metrics is not None:
        # Build a new list: `+=` would extend the caller's list in place, so
        # repeated calls with the same list would keep adding 'val_' entries.
        plot_metrics = list(plot_metrics) + ['val_' + m for m in plot_metrics]
    else:
        plot_metrics = list(histories[0].history.keys())
    histories = [h.history for h in histories]
    n_plots = len(histories)
    # squeeze=False always yields a 2-D axes array, so a single history no
    # longer fails on indexing a bare Axes object.
    fig, axes = plt.subplots(n_plots, squeeze = False, figsize = [20, 30])
    axes = axes[:, 0]
    if title is not None:
        fig.suptitle(title)
    for i, history in enumerate(histories):
        axis = axes[i]
        for key in plot_metrics:
            vals = history[key]
            axis.plot(range(len(vals)), vals, label=key)
        # Set the title once per subplot rather than once per plotted metric.
        if sub_titles is not None:
            axis.set_title(sub_titles[i])
        if len(history.keys()) > 1:
            axis.legend()
# NOTE(review): `vanilla_hists` is not defined anywhere in the visible part of this
# notebook, and the titles refer to RNN variants rather than the CNN runs above --
# confirm where it comes from; on a fresh kernel this call raises NameError.
plot_history(vanilla_hists,
             title = 'Training and Validation loss for 1 layer RNN variants', 
             sub_titles = ['Rnn','LSTM','GRU'])
# Report the iteration (epoch index) that achieved the best smoothed validation metric.
def best_timestep(history, metric_name = 'val_binary_accuracy', big_good = True):
    """Find the epoch whose locally averaged metric is best and report its validation values.

    Each epoch is scored by the mean of the metric over the window
    [i-1, i, i+1], truncated at both ends of the series, so a single-epoch
    spike is less likely to win.

    Args:
        history: Object exposing a `.history` dict of metric name -> list of
            per-epoch values.
        metric_name: Key of the metric used to pick the best epoch.
        big_good: True when larger metric values are better, False when
            smaller ones are (e.g. a loss).

    Returns:
        dict: The value at the chosen epoch of every metric whose name
        contains 'val_', plus 'itter' with the chosen epoch index.
    """
    values = history.history[metric_name]
    n_values = len(values)
    # Slice ends are exclusive, so i + 2 is needed to include the i + 1
    # neighbour, and the upper clamp is n_values so the last value is not dropped.
    evals = [np.mean(values[max(i - 1, 0): min(i + 2, n_values)]) for i in range(n_values)]
    if not big_good:
        evals = [-e for e in evals]
    best_itter = int(np.argmax(evals))
    result = {key: value[best_itter] for key, value in history.history.items() if 'val_' in key}
    result['itter'] = best_itter
    return result
# Display the best-epoch summary of every run in `vanilla_hists`.
# NOTE(review): `vanilla_hists` is not defined in the visible part of this notebook -- confirm its origin.
[best_timestep(hist) for hist in vanilla_hists]

def get_best_results(history_list, configs):
    """Return the config whose run reached the highest best-epoch validation accuracy.

    Args:
        history_list: Training histories, one per configuration.
        configs: Configurations, indexed like `history_list`.

    Returns:
        The entry of `configs` at the position of the highest
        'val_binary_accuracy' reported by best_timestep(); the list of those
        accuracies is printed as a side effect.
    """
    accuracies = []
    for hist in history_list:
        accuracies.append(best_timestep(hist)['val_binary_accuracy'])
    winner = configs[np.argmax(accuracies)]
    print(accuracies)
    return winner

def lineplot_small_multiples(histories, title = None, sub_title_func = None, width = 2):
    """Draw one small line plot per training history on a grid with `width` columns.

    Every key of a history is drawn as one line. Grid cells beyond the last
    history are removed from the figure.

    Args:
        histories: Non-empty sequence of objects exposing a `.history` dict of
            metric name -> list of per-epoch values.
        title: Optional figure-level title.
        sub_title_func: Optional callable mapping a plot index to its subplot title.
        width: Number of columns in the grid.
    """
    histories = [h.history for h in histories]
    n_plots = len(histories)
    n_rows = int(np.ceil(n_plots/width))
    # squeeze=False keeps `ax` two-dimensional for every grid shape, including
    # 1x1 where plt.subplots would otherwise return a bare Axes object.
    fig, ax = plt.subplots(n_rows, width, squeeze = False, figsize = [40/n_rows, 30/width])
    if title is not None:
        fig.suptitle(title)
    for i in range(n_rows*width):
        y, x = divmod(i, width)
        axis = ax[y, x]
        if i >= n_plots:
            # Unused trailing cell of the grid.
            fig.delaxes(axis)
            continue
        history = histories[i]
        for key, vals in history.items():
            axis.plot(range(len(vals)), vals, label=key)
        # Set the title once per subplot rather than once per plotted metric.
        if sub_title_func is not None:
            axis.set_title(sub_title_func(i))
        if len(history.keys()) > 1:
            axis.legend()
            
def find_best_model(model_type, seq_len = 50, epochs = 40):
    """Grid-search a model class and return the fitted model with the best validation accuracy.

    The grid covers embedding size (50, 100, 300), `hidden_states`
    ([4], [8], [16]) and whether the embeddings are trainable. Every candidate
    is trained with early stopping on `val_loss` (patience 2, best weights
    restored) and scored by its maximum 'val_binary_accuracy'.

    NOTE(review): 'hidden_states' is passed to the model constructor, but the
    CNNModel visible in this notebook does not accept it -- presumably this is
    meant for other model classes; confirm.

    Args:
        model_type: Model class passed to compile_model().
        seq_len: Length the input sequences are padded to.
        epochs: Maximum number of epochs per candidate.

    Returns:
        The best-scoring compiled and fitted model, or None if no candidate
        scored above 0.
    """
    xtrain, ytrain = get_dataset('train', seq_len)
    xval, yval = get_dataset('val', seq_len)
    best_score = 0
    best_model = None
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                patience=2, 
                                restore_best_weights = True)
    for embedding in [50,100,300]:
        for hidden_states in [[4], [8], [16]]:    
            for train_embeddings in [True, False]:
                model_args = {'hidden_states': hidden_states,
                             'embedding': embedding,
                             'train_embeddings': train_embeddings}
                model = compile_model(model_type, .005, model_args, 
                                      ['BinaryAccuracy','Precision','Recall'])
                hist = model.fit(xtrain, ytrain,
                          validation_data = (xval, yval),
                          callbacks = [es_callback],
                          batch_size = 500,
                          epochs = epochs,
                          verbose = 0)
                score = np.max(hist.history['val_binary_accuracy'])
                if score > best_score:
                    best_score = score
                    best_model = model
                    print(score)
    # Return the best candidate, not whichever model happened to be trained last.
    return best_model

def evaluate_best_model(model_type, seq_len = 50):
    """Grid-search `model_type`, then score the selected model on the 'test' split.

    Args:
        model_type: Model class passed to find_best_model().
        seq_len: Sequence length used for both the search and the test data,
            so the two always agree.

    Returns:
        tuple: (class name of the selected model, dict with 'precision',
        'recall' and 'f1_score' measured on the test split).
    """
    model = find_best_model(model_type, seq_len = seq_len)
    xtest, ytest = get_dataset('test', seq_len)
    # evaluate() returns the loss followed by the compiled metrics, in the
    # order find_best_model compiled them (accuracy, precision, recall).
    [eval_loss, eval_acc, eval_precision, eval_recall] = model.evaluate(xtest, ytest)
    denominator = eval_precision + eval_recall
    # F1 is undefined when precision and recall are both 0; report 0.0 instead of crashing.
    f1_score = 2*eval_precision*eval_recall/denominator if denominator else 0.0
    result = {'precision': eval_precision, 'recall': eval_recall, 'f1_score': f1_score}
    return model.__class__.__name__, result

[{'val_loss': 0.5134511198517839,
  'val_binary_accuracy': 0.76094675,
  'itter': 13},
 {'val_loss': 0.458090587833224,
  'val_binary_accuracy': 0.78934914,
  'itter': 14},
 {'val_loss': 0.4694531284845792,
  'val_binary_accuracy': 0.7727811,
  'itter': 11}]