In [1]:
# import libraries
import os, zipfile
import numpy as np
import pandas as pd
import nltk

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

### Read in Data

In [2]:
# read in data
data = pd.read_csv('data/en-annotated.tsv', sep='\t', header=None, names=['sentence', 'label_raw'])

# get only the first label for now
# unsure if that's the most "important" one or what to do later
data['label_raw'] = data['label_raw'].str.split(',')
data['label'] = pd.to_numeric(data.label_raw.str[0])-1  ## CNN seems to expect labels to start at 0

# summarize first label
print(data.label.value_counts())
data.head()

0    3721
1    2706
7    2686
4    1845
3    1797
5    1704
2    1640
6    1429
Name: label, dtype: int64


Unnamed: 0,sentence,label_raw,label
0,", ...",[1],0
1,!,"[1, 4, 7]",0
2,... And I don't think we need to discuss the T...,"[8, 1]",7
3,* So get up out of your bed,[1],0
4,A confession that you hired [PERSON] ... and a...,"[1, 6]",0


In [3]:
# percentage of each label
data.label.value_counts(normalize=True) * 100

0    21.228891
1    15.438156
7    15.324053
4    10.526016
3    10.252168
5     9.721588
2     9.356458
6     8.152670
Name: label, dtype: float64

In [4]:
train_in, test_in, train_labels, test_labels = train_test_split(data['sentence'], data['label'], test_size = 0.33)
print('Train dataset shape: ', train_in.shape)
print('Test dataset shape: ', test_in.shape)

Train dataset shape:  (11743,)
Test dataset shape:  (5785,)


In [5]:
# In case we want a Dev set
train_in, dev_in, train_labels, dev_labels = train_test_split(train_in, train_labels, test_size = 0.2)
print('Train dataset shape: ', train_in.shape)
print('Dev dataset shape: ', dev_in.shape)

Train dataset shape:  (9394,)
Dev dataset shape:  (2349,)


### Baseline model - CNN

#### tokenize and embed sentences
- Using Glove

In [6]:
# Process sentences 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tun into tokens
# max len 
max_len = train_in.str.len().max()
if test_in.str.len().max() > max_len: max_len = test_in.str.len().max()
print('max sentence length in train and test =', max_len)

# initialize tokenizer 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_in)

# convert to sequences and pad
train_sequences = tokenizer.texts_to_sequences(train_in)
test_sequences = tokenizer.texts_to_sequences(test_in)
padding_type = "post"
truncate_type = "pre"
# use 100 for now
max_len_touse = 100
train_padded = pad_sequences(train_sequences,maxlen=max_len_touse, padding=padding_type, truncating=truncate_type)
test_padded = pad_sequences(test_sequences,maxlen=max_len_touse, padding=padding_type, truncating=truncate_type)

max sentence length in train and test = 299


In [7]:
# download Glove model
# based on https://cnvrg.io/cnn-sentence-classification/
import wget
if not os.path.isdir("data"):
    os.makedirs("data")
url = "http://nlp.stanford.edu/data/glove.6B.zip"
wget.download(url, out="data")

'data/glove.6B (1).zip'

In [8]:
with zipfile.ZipFile('data/glove.6B.zip', 'r') as zip_ref:
    zip_ref.extractall('data/glove')

In [9]:
embeddings_index = {}
f = open('data/glove/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [10]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, max_len_touse))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector

#### Train model

- First attempt: based on  https://cnvrg.io/cnn-sentence-classification/
        Did not perform better than most common class (label 0 at 21.2%)... :(

In [11]:
# define kera embedding layer
embedding_layer = keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                            output_dim=max_len_touse,
                            weights=[embedding_matrix],
                            input_length=max_len_touse,
                            trainable=False)

In [12]:
# model - option 1 
model_test = keras.models.Sequential([
    embedding_layer,
  keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.GlobalMaxPooling1D(),
  keras.layers.Dense(10, activation='relu'),
  keras.layers.Dense(1, activation='sigmoid')
])

In [13]:
# train model 
model_test.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model_test.fit(train_padded, train_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


- Second attempt: based on CCN notebook from assignment 4
        A little bit better!

In [14]:
# model - taken from CNN in A4
epochs = 10
embed_dim = 100
num_filters = [2, 2, 2]
kernel_sizes = [2, 3, 4]
dense_layer_dims = [10, 4]
dropout_rate = 0.7
num_classes = 8

# Input is a special "layer".  It defines a placeholder that will be overwritten by the training data.
# In our case, we are accepting a list of wordids (padded out to max_len).
wordids = keras.layers.Input(shape=(max_len_touse,))

# Embed the wordids.
# Recall, this is just a mathematically equivalent operation to a linear layer and a one-hot
h = keras.layers.Embedding(len(tokenizer.word_index) + 1, embed_dim, input_length=max_len_touse)(wordids)

# Construct "filters" randomly initialized filters with dimension "kernel_size" for each size of filter we want.
# With the default hyperparameters, we construct 2 filters each of size 2, 3, 4.  As in the image above, each filter
# is wide enough to span the whole word embedding (this is why the convolution is "1d" as seen in the
# function name below).
conv_layers_for_all_kernel_sizes = []
for kernel_size, filters in zip(kernel_sizes, num_filters):
    conv_layer = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(h)
    conv_layer = keras.layers.GlobalMaxPooling1D()(conv_layer)
    conv_layers_for_all_kernel_sizes.append(conv_layer)

# Concat the feature maps from each different size.
h = keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)

# Dropout can help with overfitting (improve generalization) by randomly 0-ing different subsets of values
# in the vector.
# See https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf for details.
h = keras.layers.Dropout(rate=dropout_rate)(h)

### YOUR CODE HERE
# Add a fully connected layer for each dense layer dimension in dense_layer_dims.
dense_layers = []
for dense_dim in dense_layer_dims:
    dense_layer = keras.layers.Dense(dense_dim, activation='relu')(h)
    dense_layers.append(dense_layer)
    
h = keras.layers.concatenate(dense_layers, axis=1)

### END YOUR CODE

prediction = keras.layers.Dense(num_classes, activation='softmax')(h)

model = keras.Model(inputs=wordids, outputs=prediction)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # From information theory notebooks.
              metrics=['accuracy'])        # What metric to output as we train.

In [15]:
tf.keras.utils.to_categorical(
    train_labels, num_classes=None, dtype='float32'
)
model.fit(train_padded, train_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x14ab3c430>

In [16]:
model.evaluate(test_padded, test_labels)



[1.9493663311004639, 0.25531548261642456]

In [17]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 100)     632700      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 99, 2)        402         embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 98, 2)        602         embedding_1[0][0]                
_______________________________________________________________________________________

## Training with Random Search


In [19]:
# Implement Random Search for parameters: num_filters, kernel_sizes
import random as rn
for _ in range(10):
    # Specify model hyperparameters.
    epochs = 10
    embed_dim = 100
    num_filters = [rn.randint(1, 50), rn.randint(1, 50), rn.randint(1, 50)]
    kernel_sizes = [rn.randint(1, 20), rn.randint(1, 20), rn.randint(1, 20)]
    dense_layer_dims = [10, 4]
    dropout_rate = 0.7
    num_classes = 8
    print('kernel_sizes: ', kernel_sizes)
    print('num_filters: ', num_filters)
    # Construct the convolutional neural network.

    # Input is a special "layer".  It defines a placeholder that will be overwritten by the training data.
    # In our case, we are accepting a list of wordids (padded out to max_len).
    wordids = keras.layers.Input(shape=(max_len_touse,))

    # Embed the wordids.
    # Recall, this is just a mathematically equivalent operation to a linear layer and a one-hot
    h = keras.layers.Embedding(len(tokenizer.word_index) + 1, embed_dim, input_length=max_len_touse)(wordids)

    # Construct "filters" randomly initialized filters with dimension "kernel_size" for each size of filter we want.
    # With the default hyperparameters, we construct 2 filters each of size 2, 3, 4.  As in the image above, each filter
    # is wide enough to span the whole word embedding (this is why the convolution is "1d" as seen in the
    # function name below).
    conv_layers_for_all_kernel_sizes = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        conv_layer = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(h)
        conv_layer = keras.layers.GlobalMaxPooling1D()(conv_layer)
        conv_layers_for_all_kernel_sizes.append(conv_layer)

    # Concat the feature maps from each different size.
    h = keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)

    # Dropout can help with overfitting (improve generalization) by randomly 0-ing different subsets of values
    # in the vector.
    # See https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf for details.
    h = keras.layers.Dropout(rate=dropout_rate)(h)

    ### YOUR CODE HERE
    # Add a fully connected layer for each dense layer dimension in dense_layer_dims.
    dense_layers = []
    for dense_dim in dense_layer_dims:
        dense_layer = keras.layers.Dense(dense_dim, activation='relu')(h)
        dense_layers.append(dense_layer)

    h = keras.layers.concatenate(dense_layers, axis=1)

    ### END YOUR CODE

    prediction = keras.layers.Dense(num_classes, activation='softmax')(h)

    model = keras.Model(inputs=wordids, outputs=prediction)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',  # From information theory notebooks.
                  metrics=['accuracy'])        # What metric to output as we train.
    tf.keras.utils.to_categorical(
        train_labels, num_classes=None, dtype='float32'
    )
    model.fit(train_padded, train_labels, epochs=epochs)
    model.evaluate(test_padded, test_labels)

kernel_sizes:  [5, 17, 2]
num_filters:  [50, 34, 5]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [18, 12, 19]
num_filters:  [36, 44, 14]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [11, 4, 18]
num_filters:  [44, 36, 10]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [12, 3, 10]
num_filters:  [45, 12, 47]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [7, 9, 17]
num_filters:  [43, 2, 13]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [9, 20, 19]
num_filters:  [41, 30, 16]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [13, 5, 2]
num_filters:  [25, 11, 43]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
kernel_sizes:  [11, 15, 10]
num_filters:  [47, 41, 46]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Train an RNN (LSTM)