In [3]:
import pandas as pd
import numpy as np
import re
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Input,Activation,Conv1D,MaxPooling1D,Flatten,Dense,Embedding,LSTM,Merge, Dropout, TimeDistributedDense
from keras.models import Model, Sequential
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import SGD
import evaluate
from keras.regularizers import l2, activity_l2
from nltk.corpus import stopwords

In [136]:
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 1200
MAX_NB_WORDS = 50000

In [138]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences_train = tokenizer.texts_to_sequences(texts_train)

In [139]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 38025 unique tokens.


In [142]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)

In [143]:
labels = to_categorical(np.asarray(labels_train))
print('Shape of data tensor:', data_train.shape)
print('Shape of label tensor:', labels_train.shape)

('Shape of data tensor:', (10314, 1200))
('Shape of label tensor:', (10314,))


In [144]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(0.8 * data.shape[0])

In [145]:
embeddings_index = {}
f = open("glove/glove.6B.300d.txt", "r")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400001 word vectors.


In [99]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [104]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [168]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(61, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [169]:
model.layers

[<keras.engine.topology.InputLayer at 0x15fbde5d0>,
 <keras.layers.embeddings.Embedding at 0x17f2bba10>,
 <keras.layers.convolutional.Convolution1D at 0x1512e23d0>,
 <keras.layers.pooling.MaxPooling1D at 0x15fbde850>,
 <keras.layers.convolutional.Convolution1D at 0x15fc41a10>,
 <keras.layers.pooling.MaxPooling1D at 0x15fc41e50>,
 <keras.layers.convolutional.Convolution1D at 0x15fc635d0>,
 <keras.layers.pooling.MaxPooling1D at 0x15fc96d10>,
 <keras.layers.core.Flatten at 0x15fe06b10>,
 <keras.layers.core.Dense at 0x15fe23d50>,
 <keras.layers.core.Dense at 0x15fe4ee10>]

In [129]:
# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=2, batch_size=128)

Train on 2063 samples, validate on 8251 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x137910490>

In [150]:
pred_y = model.predict(x_val, batch_size=32)

In [153]:
model.predict_classes(x_val, batch_size=32)

AttributeError: 'Model' object has no attribute 'predict_classes'

In [183]:
np.argmax(pred_y[7000])

3

### Load Data and Preprocess

In [142]:
 'i' in stopwords.words('english')

True

In [4]:
def clean_data(text, keep_period = False):
    text = text.lower()
    NER_pat = re.compile("(@[a-z]+)[1-9]+")
    text = NER_pat.sub('\\1', text)

    NUM_pat = re.compile("(\d+)\S*")
    text = NUM_pat.sub('@number', text)

    if keep_period:
        text = re.sub('[^a-zA-Z0-9.@]', ' ', text)
    else:
        text = re.sub('[^a-zA-Z0-9@]', ' ', text)
    word_list = text.split()
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    text = ' '.join(filtered_words)
    return text

In [5]:
train = pd.read_csv('processed_data/train.tsv',delimiter='\t')
val = pd.read_csv('processed_data/val.tsv',delimiter='\t')
test = pd.read_csv('processed_data/test.tsv',delimiter='\t')

In [153]:
train = train[train["essay_set"] != 8]
test = test[test["essay_set"] != 8]
train = train[train["essay_set"] != 7]
test = test[test["essay_set"] != 7]

In [6]:
texts_train = train["essay"].values
texts_train = [clean_data(i).replace('@', '') for i in texts_train]


texts_val = val["essay"].values
texts_val = [clean_data(i).replace('@', '') for i in texts_val]

texts_test = test["essay"].values
texts_test = [clean_data(i).replace('@', '') for i in texts_test]

In [7]:
labels_train = train["domain1_score"].values
labels_val = val["domain1_score"].values
labels_test = test["domain1_score"].values

In [8]:
class w2v_Model(object):
    def __init__(self, path):
        self.path = path
        self.vocab = {}
        with open(self.path, 'r') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.vocab[word] = coefs

        print('Found %s word vectors.' % len(self.vocab))

    def __getitem__(self, word):
        '''
        Return a word vector or subset of word vectors depending on the key.
        If `key` is a word string, returns the word vector for that word if present,
        else throws and error. If `key` is a list of word strings, returns a matrix X
        where
        
            X[i] = word_vector(key[i])

        else throws an error if any of the word strings does not have a vector.

        Example:
        m = Model('vectors.txt'); m.load()
        m['hello'] # returns a single vector of shape (300,)
        m[['hello', 'world']] # returns a matrix with shape (2, 300)
        '''
        
        if isinstance(word, str):
            return self.__safe_get__(word)
        elif isinstance(word, list):
            indices = map(lambda w: self.__safe_get__(w), word)
            return np.array(indices)
        
    def __safe_get__(self, word):
        if word in self.vocab:
            return self.vocab[word]
        else:
            return np.zeros(EMBEDDING_DIM)
    
    def __contains__(self, word):
        return word in self.vocab



In [9]:
w2v = w2v_Model('glove/glove.6B.100d.txt')

Found 400001 word vectors.


## Simple NN Regression

In [24]:
def regress_kappa(y_true, y_pred, target = train):
    #y_pred = y_pred[:,0]
    
    result_y = np.zeros((len(y_pred),3))
    result_y[:, 0 ] = target["essay_set"].values
    result_y[:,1] = target["essay_id"].values
    result_y[:,2] = np.round(y_pred[:,0])
    result_y = result_y.astype('int64')
    
    true_y = np.zeros((len(y_true),3))
    true_y[:, 0 ] = target["essay_set"].values
    true_y[:,1] = target["essay_id"].values
    true_y[:,2] = y_true 
    true_y = true_y.astype('int64')
    
    return evaluate.evaluate(result_y, true_y)

In [25]:
train_x = np.array(map(lambda essay: w2v[essay.split()].mean(axis = 0), texts_train))
train_y = labels_train.astype('float32')
test_x = np.array(map(lambda essay: w2v[essay.split()].mean(axis = 0), texts_test))
test_y = labels_test.astype('float32')

In [20]:
train_x = np.hstack((train_x, to_categorical(train["essay_set"].values)))
test_x = np.hstack((test_x, to_categorical(test["essay_set"].values)))

In [28]:
model = Sequential()
model.add(Dense(300, input_dim=100, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(150, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(1, init='normal'))

sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)


model.compile(loss='mean_squared_error', metrics = ['mean_absolute_error'], optimizer='adam')
model.fit(train_x, train_y, batch_size = 64, nb_epoch = 300 )

Epoch 1/300


InternalError: Dst tensor is not initialized.
	 [[Node: Const_369 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [100,300] values: [0 0 0]...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op u'Const_369', defined at:
  File "/Users/zelongqiu/anaconda/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/Users/zelongqiu/anaconda/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/traitlets/config/application.py", line 653, in launch_instance
    app.start()
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2827, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-28-ea0e2be225af>", line 10, in <module>
    model.fit(train_x, train_y, batch_size = 64, nb_epoch = 300 )
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/keras/models.py", line 652, in fit
    sample_weight=sample_weight)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/keras/engine/training.py", line 1083, in fit
    self._make_train_function()
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/keras/engine/training.py", line 696, in _make_train_function
    self.total_loss)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/keras/optimizers.py", line 387, in get_updates
    ms = [K.zeros(shape) for shape in shapes]
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 277, in zeros
    return variable(tf.constant_initializer(0., dtype=tf_dtype)(shape),
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py", line 149, in _initializer
    return constant_op.constant(value, dtype=dtype, shape=shape)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/constant_op.py", line 169, in constant
    attrs={"value": tensor_value, "dtype": dtype_value}, name=name).outputs[0]
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/zelongqiu/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

InternalError (see above for traceback): Dst tensor is not initialized.
	 [[Node: Const_369 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [100,300] values: [0 0 0]...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]


In [23]:
pred_y = model.predict(test_x)
test["prediction"] = pred_y
test[test["essay_set"] == 2]

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prediction
154,2979,2,Write a persuasive essay to a newspaper reflec...,1,2,1,1.645708
155,2984,2,How @CAPS4 you feel if your favorite book was ...,5,5,5,4.328355
156,2996,2,If the people that are publishing and writing ...,1,1,1,3.305116
157,3029,2,wow thats racist. as i said when i saw the mov...,3,3,3,2.887580
158,3033,2,Why should we be more carefull of what we get ...,4,4,4,3.106670
159,3045,2,"Should books, magazines, music, movies, and ec...",4,4,4,3.428966
160,3099,2,"Twilight, @PERSON1, or even The @CAPS1 @CAPS2 ...",4,4,4,4.173596
161,3113,2,I dont believe books should be takin off of th...,3,3,3,3.787910
162,3123,2,I believe censorship should be used in librari...,4,4,4,3.740323
163,3126,2,Real life is rated @CAPS1 so why pretend it's ...,4,4,4,3.855362


In [24]:
result_y = np.zeros((len(pred_y),3))
result_y[:, 0 ] = test["essay_set"].values
result_y[:,1] = test["essay_id"].values
result_y[:,2] = np.round(pred_y[:,0])
result_y = result_y.astype('int64')

In [25]:
true_y = np.zeros((len(pred_y),3))
true_y[:, 0 ] = test["essay_set"].values
true_y[:,1] = test["essay_id"].values
true_y[:,2] = labels_test
true_y = true_y.astype('int64')

In [26]:
evaluate.evaluate(result_y, true_y)

kappa for each set [ 0.65595079  0.60566842  0.42155585  0.68330306  0.62259739  0.67250027
  0.64822492  0.6039781 ]


0.61912948702497483

In [27]:
regress_kappa(labels_train,model.predict(train_x),train)

kappa for each set [ 0.92041811  0.76767925  0.92922567  0.9326163   0.94218313  0.92450137
  0.98358915  0.9679918 ]


0.93801721836579621

## Simple NN Softmax classification 

In [28]:
import keras.backend as K

In [29]:
def one_hot_kappa(y_true, y_pred, target = train):
    y_pred = np.argmax(y_pred,axis =1)
    y_true = np.argmax(y_true,axis = 1)
    
    result_y = np.zeros((len(y_pred),3))
    result_y[:, 0 ] = target["essay_set"].values
    result_y[:,1] = target["essay_id"].values
    result_y[:,2] = y_pred
    result_y = result_y.astype('int64')
    
    true_y = np.zeros((len(y_true),3))
    true_y[:, 0 ] = target["essay_set"].values
    true_y[:,1] = target["essay_id"].values
    true_y[:,2] = y_true 
    true_y = true_y.astype('int64')
    
    return evaluate.evaluate(result_y, true_y)

In [30]:
train_x = np.array(map(lambda essay: w2v[essay.split()].mean(axis = 0), texts_train))
test_x = np.array(map(lambda essay: w2v[essay.split()].mean(axis = 0), texts_test))

In [211]:
train_x = np.hstack((train_x, to_categorical(train["essay_set"].values)))
test_x = np.hstack((test_x, to_categorical(test["essay_set"].values)))

In [31]:
train_y = to_categorical(labels_train)
test_y = to_categorical(labels_test)

In [45]:
train_y = to_categorical(np.hstack((labels_train, labels_test)))[0:len(labels_train),:]
test_y = to_categorical(np.hstack((labels_train, labels_test)))[len(labels_train):,:]

In [47]:
model = Sequential()
model.add(Dense(300, input_dim = 300, init='normal', activation='relu'))
model.add(Dense(100, init='normal', activation='relu'))
model.add(Dense(100, init='normal', activation='relu'))
model.add(Dense(61, init='normal', activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, batch_size = 64, nb_epoch = 200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x13dbdc650>

In [54]:
model = Sequential()
model.add(Dense(300, input_dim=300, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(150, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(150, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))

model.add(Dense(61, init='normal', activation='softmax'))
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(train_x, train_y, batch_size = 32, nb_epoch = 200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x13eeefdd0>

In [55]:
pred_y = model.predict_classes(test_x)
test["prediction"] = pred_y
test[test["essay_set"] == 8]



Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prediction
1204,20716,8,A long time ago when I was in third grade I h...,18,16,34,40
1205,20717,8,Softball has to be one of the single most gre...,21,26,46,40
1206,20727,8,"Laugher Laughter is to express delight, fun, ...",16,15,31,40
1207,20735,8,one time when i was skateboarding with my fri...,10,10,20,40
1208,20800,8,"This true story might sound some what cheesy,...",18,15,33,40
1209,20804,8,Laughter is important in my life for many reas...,20,20,40,40
1210,20819,8,LaughterI laugh everyday. I feel a lot of dif...,15,15,30,40
1211,20825,8,In this essay I will be telling you about why...,15,18,33,3
1212,20832,8,Laughter is something that is there when you ...,20,18,36,40
1213,20841,8,"As we understand the benefits of laughter, we...",17,16,33,40


In [56]:
result_y = np.zeros((len(pred_y),3))
result_y[:, 0 ] = test["essay_set"].values
result_y[:,1] = test["essay_id"].values
result_y[:,2] = pred_y
result_y = result_y.astype('int64')

In [57]:
true_y = np.zeros((len(pred_y),3))
true_y[:, 0 ] = test["essay_set"].values
true_y[:,1] = test["essay_id"].values
true_y[:,2] = labels_test
true_y = true_y.astype('int64')

In [58]:
evaluate.evaluate(result_y, true_y)

kappa for each set [ 0.44044698  0.57673174  0.55416614  0.71316647  0.68496903  0.60792848
  0.52000519  0.00146303]


0.53439040024221085

In [None]:
one_hot_kappa(train_y, model.predict(train_x), train)

In [59]:
one_hot_kappa(train_y, model.predict(train_x), train)

kappa for each set [ 0.47483219  0.53685594  0.72348302  0.76873897  0.77758093  0.77304073
  0.58950121  0.30666794]


0.64320712745851172

### tfdif regression

In [364]:
MAX_NB_WORDS = 2500
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_train + texts_test)

In [365]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 28587 unique tokens.


In [366]:
tokenizer.texts_to_sequences(['hello world combinations'])

[[130]]

In [367]:
train_x = tokenizer.texts_to_matrix(texts_train, mode = 'tfidf')
train_y = labels_train.astype('float32')
test_x = tokenizer.texts_to_matrix(texts_test, mode = 'tfidf')
test_y = labels_test.astype('float32')

In [368]:
train_x.shape

(8491, 2500)

In [369]:
model = Sequential()
model.add(Dense(2500, input_dim=MAX_NB_WORDS, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(1000, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(1000, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(1, init='normal'))

sgd = SGD(lr=0.002)


model.compile(loss='mean_squared_error', metrics = ['mean_absolute_error'], optimizer='adam')
model.fit(train_x, train_y, batch_size = 128, nb_epoch = 50 )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1df68cf50>

In [370]:
pred_y = model.predict(test_x)
test["prediction"] = pred_y
test[test["essay_set"] == 2]

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prediction
154,2979,2,Write a persuasive essay to a newspaper reflec...,1,2,1,2.399050
155,2984,2,How @CAPS4 you feel if your favorite book was ...,5,5,5,3.906921
156,2996,2,If the people that are publishing and writing ...,1,1,1,2.718492
157,3029,2,wow thats racist. as i said when i saw the mov...,3,3,3,2.872141
158,3033,2,Why should we be more carefull of what we get ...,4,4,4,3.931227
159,3045,2,"Should books, magazines, music, movies, and ec...",4,4,4,3.931227
160,3099,2,"Twilight, @PERSON1, or even The @CAPS1 @CAPS2 ...",4,4,4,3.935044
161,3113,2,I dont believe books should be takin off of th...,3,3,3,2.765417
162,3123,2,I believe censorship should be used in librari...,4,4,4,2.372568
163,3126,2,Real life is rated @CAPS1 so why pretend it's ...,4,4,4,3.396649


In [371]:
regress_kappa(labels_train,model.predict(train_x),train)

kappa for each set [ 0.9573843   0.91683706  0.96798999  0.98729912  0.98625968  0.98133063]


0.97311439697647051

In [372]:
regress_kappa(labels_test,model.predict(test_x),test)

kappa for each set [ 0.48131304  0.52971314  0.47251242  0.69335093  0.70572218  0.594049  ]


0.58767347881919696

### tfdif regression

In [407]:
MAX_NB_WORDS = 1000
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_train + texts_test)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 28587 unique tokens.


In [408]:
train_x = tokenizer.texts_to_matrix(texts_train, mode = 'tfidf')
test_x = tokenizer.texts_to_matrix(texts_test, mode = 'tfidf')

In [409]:
train_y = to_categorical(labels_train)
test_y = to_categorical(labels_test)

In [402]:
train_x = np.hstack((train_x, to_categorical(train["essay_set"].values)))
test_x = np.hstack((test_x, to_categorical(test["essay_set"].values)))

In [410]:
model = Sequential()
model.add(Dense(1000, input_dim=MAX_NB_WORDS, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(1000, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))
model.add(Dense(1000, init='normal', activation='relu',\
          W_regularizer=l2(0.0001), activity_regularizer=activity_l2(0.0001)))

model.add(Dense(13, init='normal', activation='softmax'))
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, batch_size = 128, nb_epoch = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x218d0ccd0>

In [411]:
pred_y = model.predict_classes(test_x)
test["prediction"] = pred_y
test[test["essay_set"] == 1]



Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prediction
0,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,10,10
1,17,1,"Dear Local Newspaper, I belive that computers ...",4,4,8,8
2,46,1,"Dear, Local Newspaper @CAPS1 here to inform yo...",4,4,8,2
3,55,1,"ARE YOU @CAPS1!! Computers are great, they're ...",4,4,8,8
4,61,1,Do you spend all or most of your freetime sitt...,4,4,8,10
5,69,1,Some people think it is a good idea and same d...,3,4,7,8
6,87,1,Computers. One of the much enjoyed pieces of t...,5,5,10,10
7,108,1,I saw in one of the news papers I got in the m...,4,4,8,10
8,127,1,Computers can affect the way people are and ho...,4,5,9,10
9,160,1,"Dear local newspaper, I agree that people are ...",4,4,8,8


In [412]:
one_hot_kappa(train_y, model.predict(train_x), train)

kappa for each set [ 0.8397268   0.72055908  0.98118579  0.97978893  0.88684094  0.97170524]


0.93702781596423268

In [413]:
one_hot_kappa(test_y, model.predict(test_x), test)

kappa for each set [ 0.51761489  0.09874275  0.09022382  0.17894521  0.18033696  0.22126791]


0.22140911534217456

### Sequence Regression

In [10]:
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 700
EMBEDDING_DIM = 100

In [11]:
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_train + texts_test)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 35617 unique tokens.


In [12]:
train_x = tokenizer.texts_to_sequences(texts_train)
test_x = tokenizer.texts_to_sequences(texts_test)

In [13]:
train_x = pad_sequences(train_x, padding = 'post', truncating = 'post', maxlen=MAX_SEQUENCE_LENGTH)
test_x = pad_sequences(test_x, padding = 'post',truncating = 'post', maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', train_x.shape)


('Shape of data tensor:', (10314, 700))


In [19]:
inv_map = {v: k for k, v in word_index.iteritems()}

In [20]:
inv_map

{1: 'caps',
 2: 'people',
 3: 'would',
 4: 'computers',
 5: 'time',
 6: 'like',
 7: 'one',
 8: 'person',
 9: 'computer',
 10: 'get',
 11: 'could',
 12: 'also',
 13: 'books',
 14: 'think',
 15: 'building',
 16: 'num',
 17: 'many',
 18: 'things',
 19: 'go',
 20: 'family',
 21: 'book',
 22: 'way',
 23: 'even',
 24: 'author',
 25: 'life',
 26: 'parents',
 27: 'know',
 28: 'make',
 29: 'friends',
 30: 'good',
 31: 'going',
 32: 'offensive',
 33: 'story',
 34: 'want',
 35: 'take',
 36: 'us',
 37: 'read',
 38: 'see',
 39: 'home',
 40: 'day',
 41: 'something',
 42: 'new',
 43: 'said',
 44: 'mood',
 45: 'much',
 46: 'library',
 47: 'got',
 48: 'back',
 49: 'dirigibles',
 50: 'state',
 51: 'use',
 52: 'location',
 53: 'cyclist',
 54: 'another',
 55: 'children',
 56: 'organization',
 57: 'help',
 58: 'find',
 59: 'music',
 60: 'always',
 61: 'around',
 62: 'thing',
 63: 'need',
 64: 'empire',
 65: 'really',
 66: 'say',
 67: 'world',
 68: 'first',
 69: 'kids',
 70: 'right',
 71: 'bad',
 72: 'made'

In [14]:
train_y = to_categorical(np.hstack((labels_train, labels_test)))[0:len(labels_train),:]
test_y = to_categorical(np.hstack((labels_train, labels_test)))[len(labels_train):,:]

In [15]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = w2v[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [21]:
embedding_matrix = np.zeros((5000 + 1, EMBEDDING_DIM))
for i in range(1,5001):
    word = inv_map[i]
    embedding_vector = w2v[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [22]:
embedding_matrix.shape

(5001, 100)

In [23]:
embedding_vecor_length = EMBEDDING_DIM

def fork (model, n=2):
    forks = []
    for i in range(n):
        f = Sequential()
        f.add (model)
        forks.append(f)
    return forks


model = Sequential()

model_left = Sequential()
model_left.add(Embedding(5000 + 1, embedding_vecor_length, \
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model_left.add(LSTM(64, return_sequences=True))

model_right = Sequential()
model_right.add(Embedding(5000 + 1, embedding_vecor_length, \
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model_right.add(LSTM(64, go_backwards=True, return_sequences=True))
model.add( Merge([model_left, model_right], mode='concat'))
model.add(Dropout(0.2))

left, right = fork(model)

left.add(LSTM(64))
right.add(LSTM(64,  go_backwards=True))

model = Sequential()
model.add(Merge([left, right], mode='concat'))
model.add(Dropout(0.2))
model.add(Dense(61, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit([train_x, train_x], train_y, batch_size = 128, nb_epoch= 50)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
sequential_3 (Sequential)        (None, 700, 128)      1084680                                      
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 64)            49408                                        
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 64)            49408                                        
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 128)           0           merge_2[0][0]                    
___________________________________________________________________________________________

KeyboardInterrupt: 

In [103]:
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(len(word_index) + 1, embedding_vecor_length, \
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(61, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(train_x, train_y, batch_size = 128, nb_epoch= 3)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_18 (Embedding)         (None, 500, 300)      10730100    embedding_input_18[0][0]         
____________________________________________________________________________________________________
lstm_19 (LSTM)                   (None, 100)           160400      embedding_18[0][0]               
____________________________________________________________________________________________________
dropout_10 (Dropout)             (None, 100)           0           lstm_19[0][0]                    
____________________________________________________________________________________________________
dense_31 (Dense)                 (None, 61)            6161        dropout_10[0][0]                 
Total params: 10896661
____________________________________________________________________

<keras.callbacks.History at 0x1cb3066d0>

In [359]:
pred_y = model.predict_classes([test_x, test_x])




In [360]:
test["prediction"] = pred_y
test[test["essay_set"] == 1]

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prediction
0,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,10,12
1,17,1,"Dear Local Newspaper, I belive that computers ...",4,4,8,8
2,46,1,"Dear, Local Newspaper @CAPS1 here to inform yo...",4,4,8,7
3,55,1,"ARE YOU @CAPS1!! Computers are great, they're ...",4,4,8,8
4,61,1,Do you spend all or most of your freetime sitt...,4,4,8,8
5,69,1,Some people think it is a good idea and same d...,3,4,7,8
6,87,1,Computers. One of the much enjoyed pieces of t...,5,5,10,9
7,108,1,I saw in one of the news papers I got in the m...,4,4,8,8
8,127,1,Computers can affect the way people are and ho...,4,5,9,9
9,160,1,"Dear local newspaper, I agree that people are ...",4,4,8,8


In [357]:
one_hot_kappa(train_y, model.predict([train_x, train_x]), train)

kappa for each set [ 0.93582272  0.98486352  0.9729714   0.9837001   0.98912857  0.98136051
  0.93398956  0.82689709]


0.96812604250544088

In [358]:
one_hot_kappa(test_y, model.predict([test_x, test_x]), test)

kappa for each set [ 0.49066455  0.00757581  0.10762456  0.77021246  0.30211683  0.45438115
  0.51410443  0.30636055]


0.39695948628924149

In [361]:
def generate_output(data_df, prediction, name):
	data_df["prediction"] = prediction
	data_df = data_df[["essay_set", 'essay_id', 'prediction', 'domain1_score']]
	data_df.to_csv(name, index = False, header = False)

In [362]:
generate_output(test, model.predict_classes([test_x, test_x]), 'Bidirectional_lstm_test.csv')



In [364]:
generate_output(train, model.predict_classes([train_x, train_x]), 'Bidirectional_lstm_train.csv')



## Sequence Regression

In [339]:
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 400
EMBEDDING_DIM = 100

In [159]:
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_train + texts_test)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 35617 unique tokens.


In [160]:
train_x = tokenizer.texts_to_sequences(texts_train)
test_x = tokenizer.texts_to_sequences(texts_test)

In [320]:
len(train_x[0])

344

In [161]:
train_x = pad_sequences(train_x, padding = 'post',maxlen=MAX_SEQUENCE_LENGTH)
test_x = pad_sequences(test_x, padding = 'post', maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', train_x.shape)

('Shape of data tensor:', (10314, 700))


In [343]:
train_y = labels_train.astype('float32')
test_y = labels_test.astype('float32')

In [344]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = w2v[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [347]:
embedding_vecor_length = EMBEDDING_DIM

def fork (model, n=2):
    forks = []
    for i in range(n):
        f = Sequential()
        f.add (model)
        forks.append(f)
    return forks


model = Sequential()

model_left = Sequential()
model_left.add(Embedding(len(word_index) + 1, embedding_vecor_length, \
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model_left.add(LSTM(64))

model_right = Sequential()
model_right.add(Embedding(len(word_index) + 1, embedding_vecor_length, \
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model_right.add(LSTM(64, go_backwards=True))

model.add( Merge([model_left, model_right], mode='concat'))
model.add(Dropout(0.2))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1, init='normal'))

sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)


model.compile(loss='mean_squared_error', metrics = ['mean_absolute_error'], optimizer='adam')
print(model.summary())
model.fit([train_x, train_x], train_y, batch_size = 128, nb_epoch= 30)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_73 (Embedding)         (None, 400, 100)      3576700                                      
____________________________________________________________________________________________________
lstm_99 (LSTM)                   (None, 64)            42240                                        
____________________________________________________________________________________________________
embedding_74 (Embedding)         (None, 400, 100)      3576700                                      
____________________________________________________________________________________________________
lstm_100 (LSTM)                  (None, 64)            42240                                        
___________________________________________________________________________________________

<keras.callbacks.History at 0x3004877d0>

In [None]:
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(len(word_index) + 1, embedding_vecor_length, \
                    weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(13, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print(model.summary())
model.fit(train_x, train_y, batch_size = 128, nb_epoch= 3)

In [None]:
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(61, activation='softmax')(x)

In [348]:
regress_kappa(train_y, model.predict([train_x, train_x]), train)

kappa for each set [ 0.85608191  0.62302006  0.69364329  0.62774886  0.77346977  0.71338742
  0.95053852  0.93725111]


0.81256575262230601

In [349]:
regress_kappa(test_y, model.predict([test_x, test_x]), test)

kappa for each set [ 0.72805651  0.14926855  0.62803101  0.63152203  0.63041704  0.65642458
  0.62133442  0.37867837]


0.57262662140342091