In [1]:
import sys

from keras.models import Sequential
from keras.layers import LSTM, Embedding
from keras.layers import Masking
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import Flatten

import numpy as np
import random
import re
import string

mxlen=20

char_map={"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "A": 27, "B": 28, "C": 29, "D": 30, "E": 31, "F": 32, "G": 33, "H": 34, "I": 35, "J": 36, "K": 37, "L": 38, "M": 39, "N": 40, "O": 41, "P": 42, "Q": 43, "R": 44, "S": 45, "T": 46, "U": 47, "V": 48, "W": 49, "X": 50, "Y": 51, "Z": 52, "0": 53, "1": 54, "2": 55, "3": 56, "4": 57, "5": 58, "6": 59, "7": 60, "8": 61, "9": 62, "_": 63, "UNK":64}


def getTrainingData(hashtags, mxlen):
    """Convert space-segmented hashtags into character lists and boundary labels.

    Every string in ``hashtags`` is scanned character by character. Spaces are
    not emitted as characters; each emitted character gets label 1 when the
    character that follows it in the string is a space, and 0 otherwise. The
    final character of the string (when it is not a space) gets label 0.

    Args:
        hashtags: iterable of strings, words separated by single spaces
            (e.g. ``"go bears"``).
        mxlen: maximum number of characters, and of labels, kept per hashtag.

    Returns:
        A tuple ``(trainingData, labels)`` of two parallel lists. Entry ``k`` of
        ``trainingData`` is the list of non-space characters of hashtag ``k``
        and entry ``k`` of ``labels`` is the matching list of 0/1 labels; both
        are cut to at most ``mxlen`` items so they always have equal length.
    """
    trainingData = []
    labels = []

    for hashtag in hashtags:
        train_hashtag = []
        label = []

        # All characters except the last one: the label looks one step ahead.
        for pos in range(len(hashtag) - 1):
            letter = hashtag[pos]
            if letter == " ":
                continue
            label.append(1 if hashtag[pos + 1] == " " else 0)
            train_hashtag.append(letter)

        # The last character has no successor. Skip it when the string is
        # empty (would raise IndexError) or ends in a space (a space must never
        # be emitted as a character).
        if hashtag and hashtag[-1] != " ":
            train_hashtag.append(hashtag[-1])
            label.append(0)

        # Truncate labels exactly like the characters so the two stay aligned.
        labels.append(label[:mxlen])
        trainingData.append(train_hashtag[:mxlen])

    return trainingData, labels

# pad input sequence to fixed length
def pad(trainingData, labels, mxlen):
    """Pad (and truncate) every sample and label list to exactly ``mxlen`` items.

    The lists inside ``trainingData`` and ``labels`` are modified in place:
    anything beyond ``mxlen`` is removed and shorter lists are filled up with
    the integer ``-1``. Callers that keep a reference to the input lists
    therefore see the padded versions as well.

    Args:
        trainingData: list of lists of characters, one inner list per sample.
        labels: list of lists of labels, parallel to ``trainingData``.
        mxlen: target length of every inner list.

    Returns:
        A tuple ``(data, labels)`` where ``data`` is ``np.array(trainingData)``
        built after padding and ``labels`` is the same (now padded) list object
        that was passed in.
    """
    for sample, label in zip(trainingData, labels):
        # Truncate in place first; the original slicing only rebound local
        # names, so over-long labels were never actually shortened.
        del sample[mxlen:]
        del label[mxlen:]

        # A non-positive repeat count yields an empty list, so lists that are
        # already mxlen long are left untouched.
        sample.extend([-1] * (mxlen - len(sample)))
        label.extend([-1] * (mxlen - len(label)))

    return np.array(trainingData), labels


def get_data(filename):
    """Read a hashtag file and encode it as fixed-length integer sequences.

    The file is split on newlines and empty lines are dropped. The remaining
    lines go through ``getTrainingData`` and ``pad`` (both using the
    module-level ``mxlen``), after which every character is replaced by its
    ``char_map`` index: the padding marker becomes 0 and characters missing
    from ``char_map`` become ``char_map["UNK"]``.

    Args:
        filename: path of the text file, one hashtag per line.

    Returns:
        A tuple ``(originalData, data, labels)``:
        ``originalData`` is the list of character lists returned by
        ``getTrainingData`` (the same object that was handed to ``pad``),
        ``data`` is an array reshaped to ``(samples, mxlen)`` holding the
        character indices, and ``labels`` is an array reshaped to
        ``(samples, mxlen, 1)``.
    """
    with open(filename, 'r') as f:
        lines = f.read().split('\n')
    hashtag_data = [line for line in lines if len(line) > 0]

    originalData, labels = getTrainingData(hashtag_data, mxlen)
    data, labels = pad(originalData, labels, mxlen)

    samples = len(data)
    flat_chars = np.asarray(data).reshape((samples*mxlen, 1))

    def encode(symbol):
        # The padding value reaches this point as the string "-1".
        if symbol == "-1":
            return 0
        return char_map[symbol] if symbol in char_map else char_map["UNK"]

    encoded = [encode(row[0]) for row in flat_chars]

    labels = np.asarray(labels).reshape((samples, mxlen, 1))
    data = np.array(encoded).reshape(samples, mxlen)

    return originalData, data, labels


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# get data
# Each get_data call returns (character lists, integer-encoded array, labels array)
# for one split; the labels of the test split are discarded.
originalTraining, trainingData, labels = get_data("train.txt")
originalDev, devData, devLabels = get_data("dev.txt")
originalTest, testData, _ = get_data("test.txt")

## 1 Bidirectional LSTM

In [3]:
hidden_neurons = 256
char_dim=20

# define LSTM
model = Sequential()
# get_data encodes padding as index 0 and real characters as 1..len(char_map),
# hence len(char_map)+1 embedding rows; mask_zero=True marks index 0 as padding.
model.add(Embedding(input_dim=len(char_map)+1, output_dim=char_dim, input_length=mxlen, mask_zero=True))
#add Bidirectional LSTM layer here
# return_sequences=True keeps one output per character position.
model.add(Bidirectional(LSTM(hidden_neurons, return_sequences=True),input_shape=(mxlen, char_dim)))
#add Dense Time Distributed output layer here
# One sigmoid unit per position: the predicted boundary label for that character.
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 20)            1300      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 20, 512)           567296    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 1)             513       
Total params: 569,109
Trainable params: 569,109
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
# Training hyper-parameters: samples per gradient step and passes over the training set.
BATCH_SIZE=256
NUM_EPOCHS=2

# Fit model on training data, evaluating on dev data
model.fit(trainingData, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=(devData, devLabels))

Train on 705490 samples, validate on 1282 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0xb1f4a8d68>

In [5]:
#save model
# Writes the trained model to model_1.h5; the later load_model('model_1.h5') calls read this file.
model.save('model_1.h5')

In [3]:
#load model
# Rebinds `model` to the network stored in model_1.h5, so the cells below can
# run without repeating the training cell.
from keras.models import load_model
model = load_model('model_1.h5')

## 2 Predictions to Segmentations

In [4]:
# convert predictions to segmentation
def segment(input_seq, ys):
    """
    Return the original hashtag and the segmented hashtag as lists of characters.

    Entries of ``input_seq`` equal to -1 are treated as padding and dropped.
    ``ys`` is cut to the number of remaining characters; wherever ``ys[k]`` is
    truthy a ``' '`` is placed directly after character ``k``.

       >>> input_seq = ['g', 'o', 'b', 'e', 'a', 'r', 's', -1, -1, -1]
       >>> ys = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
       >>> segment(input_seq, ys)
       (['g', 'o', 'b', 'e', 'a', 'r', 's'], ['g', 'o', ' ', 'b', 'e', 'a', 'r', 's'])
    """
    original = [ch for ch in input_seq if ch != -1]
    boundaries = ys[:len(original)]

    segmentation = []
    for idx, ch in enumerate(original):
        segmentation.append(ch)
        # A truthy label means "a space follows this character".
        if idx < len(boundaries) and boundaries[idx]:
            segmentation.append(' ')

    return original, segmentation

In [5]:
#Test_Case 1
# "gobears" padded with -1 to 20 entries; the single 1 in ys sits on 'o', so the
# expected segmentation has a space between "go" and "bears".
input_seq = ['g', 'o', 'b', 'e', 'a', 'r', 's', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
ys = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0]
original, segmentation = segment(input_seq, ys)
print(original)
print(segmentation)

['g', 'o', 'b', 'e', 'a', 'r', 's']
['g', 'o', ' ', 'b', 'e', 'a', 'r', 's']


In [6]:
#Test_Case 2
# "Thisisabird" with boundaries after "This", "is" and "a": three spaces are expected.
input_seq = ['T', 'h', 'i', 's', 'i', 's', 'a', 'b', 'i', 'r', 'd', -1, -1, -1, -1, -1, -1, -1, -1, -1]
ys = [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
original, segmentation = segment(input_seq, ys)
print(original)
print(segmentation)

['T', 'h', 'i', 's', 'i', 's', 'a', 'b', 'i', 'r', 'd']
['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 'b', 'i', 'r', 'd']


In [7]:
### Fill in above function segment before running this cell ###
#load model
from keras.models import load_model
model = load_model('model_1.h5')

# Generate predictions for test data written to output.txt
yhat = model.predict_classes(testData, verbose=0)
samples,_,_ = yhat.shape

# The context manager closes output.txt even if segment() or write() raises;
# the file is only created once the predictions are available.
with open("output.txt", "w") as out:
    for batch_num in range(samples):
        # Flatten the per-position predictions of this sample into a plain list.
        vals = [yhat[batch_num][seq][0] for seq in range(mxlen)]
        # Row batch_num of yhat belongs to entry batch_num of originalTest.
        original, segmentation = segment(originalTest[batch_num], vals)
        # One line per hashtag: unsegmented text, a tab, segmented text.
        out.write("%s\t%s\n" % (''.join(original), ''.join(segmentation)))

## 3 Chunking-System Evaluation F-1 Score

In [8]:
#function for segmentation
def segment_func(labels,cur):
    """Split a boundary-label sequence into chunks keyed by their end index.

    A truthy ``labels[i]`` closes a chunk at position ``i``. Whatever is left
    after the last boundary forms the final chunk, keyed by the last index.

    Args:
        labels: sliceable sequence of boundary labels (truthy = chunk ends here).
        cur: index at which the first chunk starts.

    Returns:
        dict mapping the end index of every chunk to the slice of ``labels``
        covering that chunk. Two label sequences contain the same chunk exactly
        when they share a key with equal slices. Empty input gives ``{}``.
    """
    chunks_ids = {}
    for i in range(len(labels)):
        if labels[i]:
            chunks_ids[i] = labels[cur:i+1]
            cur = i+1

    # Only add a trailing chunk when something is left after the last boundary.
    # Unconditionally assigning here used to replace a chunk closed on the very
    # last position with an empty slice. Slicing to the end of `labels` (rather
    # than to a global length) keeps the function self-contained.
    if cur < len(labels):
        chunks_ids[len(labels)-1] = labels[cur:]
    return chunks_ids

In [9]:
import math
def segment_F1_score(pred_labels, true_labels):
    """
    Return average F1 score of segmentations provided by the model

    For every sequence the padding entries (label -1) are removed from the true
    labels and the predictions are cut to the same length. Both label lists
    are split into chunks with ``segment_func``; a predicted chunk is correct
    when the true chunks contain an identical chunk ending at the same index.

    Args:
        pred_labels: per-sequence predictions, each a sequence of one-element
            items (``item[0]`` is the 0/1 label).
        true_labels: gold labels in the same layout, padded with -1.

    Returns:
        The mean of the per-sequence F1 scores as a float. Sequences without
        any chunk (nothing left after removing padding) score 0.0, and an empty
        ``true_labels`` yields 0.0 instead of raising ZeroDivisionError.
    """
    f_score_list = []
    for sen in range(len(true_labels)):
        # Compare the label itself: a one-element list never equals -1, so
        # testing the container would leave padding in for plain lists.
        true_strim = [i[0] for i in true_labels[sen] if i[0] != -1]
        pred_strim = [i[0] for i in pred_labels[sen][:len(true_strim)]]

        pred_chunks = segment_func(pred_strim,0)
        true_chunks = segment_func(true_strim,0)

        correct = 0
        for end, chunk in true_chunks.items():
            if end in pred_chunks and pred_chunks[end] == chunk:
                correct += 1

        # Guard the divisions: an all-padding sequence produces no chunks.
        precision = correct/len(pred_chunks) if pred_chunks else 0.0
        recall = correct/len(true_chunks) if true_chunks else 0.0

        # The tiny epsilon keeps the quotient defined when precision+recall is 0.
        f_score_list.append(2*precision*recall/(precision+recall+1e-18))

    if not f_score_list:
        return 0.0
    return sum(f_score_list)/len(f_score_list)

In [10]:
#Test_Case 3
# One sequence of 20 single-element labels: ps are the predictions, ts the gold labels.
ps = [[[0], [1], [0], [0], [1], [0], [0], [1], [0], [1], [0], [0], [1], [0], [0], [1], [0], [0], [0], [0]]]
ts = [[[0], [1], [0], [0], [1], [1], [0], [1], [1], [1], [0], [0], [1], [0], [0], [1], [0], [0], [0], [0]]]
print(segment_F1_score(ps, ts))

0.6250000000000001


In [11]:
# make predictions for devData
# Score the model's dev-set predictions against the gold dev labels with the chunk-level F1.
yhat = model.predict_classes(devData, verbose=0)
segment_F1_score(yhat, devLabels)

0.7251132902458948

## 4 [Extra Credit]  Kaggle & Architecture Exploration

(If you changed the model architecture, describe the changes you made here.)

In [None]:
# Generate kaggle csv for test data
yhat = model.predict_classes(testData, verbose=0)
samples,_,_ = yhat.shape

# The context manager closes kaggle_output.csv even if segment() or write() raises.
with open("kaggle_output.csv", "w") as kaggle_out:
    kaggle_out.write("id,expected\n")
    for batch_num in range(samples):
        # Flatten the per-position predictions of this sample into a plain list.
        vals = [yhat[batch_num][seq][0] for seq in range(mxlen)]
        # Row batch_num of yhat belongs to entry batch_num of originalTest.
        original, segmentation = segment(originalTest[batch_num], vals)
        # One row per hashtag: its index and the segmented text.
        kaggle_out.write("%s,%s\n" % (batch_num, ''.join(segmentation)))