In [1]:
import numpy as np
import keras
import re
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


### Read Data

In [2]:
# Read the whole embedding file and keep it as a list of text lines; each
# line is parsed into a (word, vector) pair in the preprocessing section.
with open('embeddings-twitter.txt', encoding='utf-8') as emb_file:
    emb_raw = str.splitlines(emb_file.read())

In [3]:
def read_tagged_sentences(path):
    """Read a tagged corpus whose sentences are separated by blank lines.

    Args:
        path: location of a UTF-8 text file.

    Returns:
        A list with one entry per sentence. Each entry is the flat list
        produced by splitting the sentence text on newlines and tabs; the
        preprocessing step reads even positions as tokens and odd positions
        as tags.
    """
    with open(path, encoding='utf-8') as f:
        chunks = f.read().split("\n\n")
    # Skip empty chunks instead of unconditionally dropping the last one, so
    # the final sentence survives when the file lacks a trailing blank line.
    # Stripping newlines keeps empty fields out of the token/tag list.
    return [re.split(r'\n|\t', chunk.strip("\n")) for chunk in chunks if chunk.strip()]


train = read_tagged_sentences('tweet-pos/tweets-train.txt')
dev = read_tagged_sentences('tweet-pos/tweets-dev.txt')
devtest = read_tagged_sentences('tweet-pos/tweets-devtest.txt')

### Preprocessing

In [4]:
# Parse every raw line into word -> vector: the first whitespace-separated
# field is the word, the remaining fields are the vector components.
emb = {}
for line in emb_raw:
    fields = line.split()
    word, components = fields[0], fields[1:]
    emb[word] = np.array([float(value) for value in components])

In [5]:
# Vector stored under the "UUUNKKK" entry; get_emb returns it for any word
# that has no entry of its own.
UNK_TOKEN = "UUUNKKK"
unk = emb[UNK_TOKEN]

In [6]:
def get_emb(emb, word, unk_token="UUUNKKK"):
    """Return the embedding of `word`, falling back to the unknown-word vector.

    The fallback is looked up in the same `emb` table that was passed in, so
    the function no longer depends on a module-level variable having been
    defined by an earlier cell.

    Args:
        emb: mapping from word to embedding vector.
        word: the word to look up.
        unk_token: key in `emb` whose vector is returned for missing words.

    Returns:
        The vector stored for `word`, or the one stored for `unk_token`.

    Raises:
        KeyError: if neither `word` nor `unk_token` is present in `emb`.
    """
    vector = emb.get(word)
    if vector is None:
        # Explicit None check: the truth value of a numpy array is ambiguous.
        vector = emb[unk_token]
    return vector

In [7]:
def preprocessing(dset, w, emb):
    """Turn tagged sentences into context-window feature vectors.

    Args:
        dset: iterable of sentences, each a flat list alternating
            token, tag, token, tag, ...
        w: number of context positions taken on each side of the centre
            token; sentences are padded with `w` copies of the '<s>' and
            '</s>' vectors.
        emb: word -> vector table passed through to `get_emb`.

    Returns:
        A tuple `(x, labels, categories)`: `x` is an array with one row per
        token (the concatenation of the 2*w+1 vectors in its window),
        `labels` is the list of tags in the same order, and `categories` is
        the set of distinct tags.
    """
    windows = []
    labels = []
    for sentence in dset:
        words, tags = sentence[::2], sentence[1::2]
        labels.extend(tags)

        # Sentence vectors with w boundary vectors on each side.
        padded = [get_emb(emb, '<s>')] * w
        padded.extend(get_emb(emb, word) for word in words)
        padded.extend([get_emb(emb, '</s>')] * w)

        # In the padded list, the window centred on word k starts at index k.
        for start in range(len(words)):
            windows.append(np.concatenate(padded[start:start + 2 * w + 1]))

    return np.array(windows), labels, set(labels)

In [8]:
# Features, tag lists and tag sets for each split with a context window of
# one word on each side (w=1).
train_x, train_labels, train_categories = preprocessing(dset=train, w=1, emb=emb)
dev_x, dev_labels, dev_categories = preprocessing(dset=dev, w=1, emb=emb)
devtest_x, devtest_labels, devtest_categories = preprocessing(dset=devtest, w=1, emb=emb)

In [9]:
# Number of distinct tags seen in the training split; used below as the
# width of every model's softmax output layer.
n = len(train_categories)

In [10]:
# Tags that occur in train but never in dev.
train_categories.difference(dev_categories)

{'M', 'Y'}

In [11]:
# Tags that occur in train but never in devtest.
train_categories.difference(devtest_categories)

{'M', 'Y'}

In [12]:
# Map tag strings to integer ids using the training tags only, then one-hot
# encode every split.
encoder = LabelEncoder()
encoder.fit(train_labels)

# Pin the one-hot width to the number of training classes. Without
# num_classes, to_categorical infers the width from the largest id present
# in each split, so a split that lacks the highest-id tag would produce a
# narrower matrix than the model's output layer.
num_classes = len(encoder.classes_)
train_y = to_categorical(encoder.transform(train_labels), num_classes=num_classes)
dev_y = to_categorical(encoder.transform(dev_labels), num_classes=num_classes)
devtest_y = to_categorical(encoder.transform(devtest_labels), num_classes=num_classes)

### Baseline Tagger (w=1)

In [13]:
# Early stopping driven by validation accuracy; no patience is passed, so
# the Keras default applies. The same list is handed to every fit() call.
early_stopping = EarlyStopping(monitor='val_acc', verbose=1)
callbacks = [early_stopping]

In [14]:
# Baseline tagger: one 128-unit tanh hidden layer followed by a softmax
# over the n tags, trained with SGD on categorical cross-entropy.
model = Sequential()
# The input width is taken from the feature matrix rather than hard-coded,
# so it stays correct if the window size or embedding size changes.
model.add(Dense(128, activation='tanh', input_dim=train_x.shape[1]))
model.add(Dense(n, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
# At most 50 epochs; the shared callbacks may stop training earlier based on
# the dev split.
model.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 00026: early stopping


<keras.callbacks.History at 0x275c0299fd0>

In [15]:
# evaluate() yields loss followed by the accuracy metric requested at
# compile time; report accuracy on the devtest split.
test_loss, test_acc = model.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8358501118568232


### Vary w

#### w=0 

In [16]:
# Features with no context (w=0): each row is the centre word's vector only.
# The labels are unchanged, so only the feature matrix (element 0) is kept.
train_x_w0, dev_x_w0, devtest_x_w0 = (
    preprocessing(split, 0, emb)[0] for split in (train, dev, devtest)
)

In [17]:
# Same architecture as the baseline, fed with the w=0 features.
model_w0 = Sequential()
# Input width follows the w=0 feature matrix instead of a hard-coded number.
model_w0.add(Dense(128, activation='tanh', input_dim=train_x_w0.shape[1]))
model_w0.add(Dense(n, activation='softmax'))
model_w0.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_w0.fit(train_x_w0, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x_w0, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 00020: early stopping


<keras.callbacks.History at 0x275c2cd7c18>

In [18]:
# Devtest accuracy of the w=0 model (evaluate() yields loss, then accuracy).
test_loss, test_acc = model_w0.evaluate(x=devtest_x_w0, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8069071588366891


#### w=2 

In [19]:
# Features with two context words on each side (w=2). The labels are
# unchanged, so only the feature matrix (element 0) is kept.
train_x_w2, dev_x_w2, devtest_x_w2 = (
    preprocessing(split, 2, emb)[0] for split in (train, dev, devtest)
)

In [20]:
# Same architecture as the baseline, fed with the w=2 features.
model_w2 = Sequential()
# Input width follows the w=2 feature matrix instead of a hard-coded number.
model_w2.add(Dense(128, activation='tanh', input_dim=train_x_w2.shape[1]))
model_w2.add(Dense(n, activation='softmax'))
model_w2.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_w2.fit(train_x_w2, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x_w2, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 00025: early stopping


<keras.callbacks.History at 0x275c516e7f0>

In [21]:
# Devtest accuracy of the w=2 model (evaluate() yields loss, then accuracy).
test_loss, test_acc = model_w2.evaluate(x=devtest_x_w2, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8323545861297539


### Change non-linearity functions

#### identity

In [22]:
# Baseline architecture with an identity ('linear') hidden activation.
model_i = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_i.add(Dense(128, activation='linear', input_dim=train_x.shape[1]))
model_i.add(Dense(n, activation='softmax'))
model_i.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_i.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 00029: early stopping


<keras.callbacks.History at 0x275c9663da0>

In [23]:
# Devtest accuracy of the identity-activation model.
test_loss, test_acc = model_i.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8386465324384788


#### ReLU 

In [24]:
# Baseline architecture with a ReLU hidden activation.
model_relu = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_relu.add(Dense(128, activation='relu', input_dim=train_x.shape[1]))
model_relu.add(Dense(n, activation='softmax'))
model_relu.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_relu.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 00028: early stopping


<keras.callbacks.History at 0x275c9880080>

In [25]:
# Devtest accuracy of the ReLU model.
test_loss, test_acc = model_relu.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8358501118568232


#### Sigmoid 

In [26]:
# Baseline architecture with a sigmoid hidden activation.
model_sig = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_sig.add(Dense(128, activation='sigmoid', input_dim=train_x.shape[1]))
model_sig.add(Dense(n, activation='softmax'))
model_sig.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_sig.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 00039: early stopping


<keras.callbacks.History at 0x275c9ad3f60>

In [27]:
# Devtest accuracy of the sigmoid model.
test_loss, test_acc = model_sig.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.7455257270693513


### Change hidden layers 

#### No hidden layer 

In [28]:
# No hidden layer: a single softmax layer applied directly to the features.
model_l0 = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_l0.add(Dense(n, activation='softmax', input_dim=train_x.shape[1]))
model_l0.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_l0.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x275c9b000f0>

In [29]:
# Devtest accuracy of the model without a hidden layer.
test_loss, test_acc = model_l0.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.799496644295302


#### 1 Hidden layer with width 256 

In [30]:
# One tanh hidden layer, widened from 128 to 256 units.
model_l1_l = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_l1_l.add(Dense(256, activation='tanh', input_dim=train_x.shape[1]))
model_l1_l.add(Dense(n, activation='softmax'))
model_l1_l.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_l1_l.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 00022: early stopping


<keras.callbacks.History at 0x275c9ef3f98>

In [31]:
# Devtest accuracy of the 256-unit single-hidden-layer model.
test_loss, test_acc = model_l1_l.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8322147651006712


#### 2 hidden layers with width 256 & 128

In [32]:
# Two tanh hidden layers of 256 and 128 units before the softmax output.
model_l2_s = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_l2_s.add(Dense(256, activation='tanh', input_dim=train_x.shape[1]))
model_l2_s.add(Dense(128, activation='tanh'))
model_l2_s.add(Dense(n, activation='softmax'))
model_l2_s.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_l2_s.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping


<keras.callbacks.History at 0x275ca144a20>

In [33]:
# Devtest accuracy of the 256/128 two-hidden-layer model.
test_loss, test_acc = model_l2_s.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8414429530201343


#### 2 hidden layers with width 512 & 256

In [34]:
# Two tanh hidden layers of 512 and 256 units before the softmax output.
model_l2_l = Sequential()
# Input width is read from the w=1 feature matrix instead of being hard-coded.
model_l2_l.add(Dense(512, activation='tanh', input_dim=train_x.shape[1]))
model_l2_l.add(Dense(256, activation='tanh'))
model_l2_l.add(Dense(n, activation='softmax'))
model_l2_l.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_l2_l.fit(train_x, train_y, epochs=50, callbacks=callbacks, validation_data=(dev_x, dev_y))

Train on 14619 samples, validate on 4823 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping


<keras.callbacks.History at 0x275cb31ff60>

In [35]:
# Devtest accuracy of the 512/256 two-hidden-layer model.
test_loss, test_acc = model_l2_l.evaluate(x=devtest_x, y=devtest_y)
print('test_acc:', test_acc)

test_acc: 0.8450782997762863
