## Spooky author identification challenge

### This challenge invites kagglers to identify the horror story author from the given text snippets. The challenge is little different from other nlp problems because we need to find the signature of the author from his writing style than simply understanding the context vectors. 

### Hence word vectors may be of little help here. I am using keras embeddings instead of word2vec and combine features from LSTM and CNN(to be able to find pattern translation) and then pass the combined features to another hidden layer in the neural network. 

In [108]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed
from keras.layers import Flatten, Reshape
from keras.layers.embeddings import Embedding
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import EarlyStopping
from keras import optimizers
from keras.backend import flatten

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [5]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)

In [6]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [7]:
# binarize the labels for neural net
from keras.utils import np_utils
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [8]:
vocab_size = 10000
xtrain_enc = [one_hot(d, vocab_size) for d in xtrain]
xvalid_enc = [one_hot(d, vocab_size) for d in xvalid]

In [9]:
max_length = 300
padded_docs_train = pad_sequences(xtrain_enc, maxlen=max_length, padding='post')
padded_docs_valid = pad_sequences(xvalid_enc, maxlen=max_length, padding='post')

In [10]:
## Create a functional api for shared features from cnn and lstm

#lstm with cnn
from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

main_input = Input(shape=(300,), dtype='int32', name='main_input')

# embedding vectors
x = Embedding(vocab_size, 300, input_length=max_length)(main_input)
#model.add(Flatten())

# lstm features
lstm_encoding = LSTM(100)(x)

# cnn features 
cnn_mod = Sequential()
cnn_mod.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu', input_shape=(300,300)))
cnn_mod.add(Flatten())
cnn_encoding = cnn_mod(x)

# combined features
merged = concatenate([lstm_encoding, cnn_encoding])

#batch_normalized = BatchNormalization()(merged)



hidden1 = Dense(300, activation='relu')(merged)

#hidden2 = Dense(3)(hidden1)
output = Dense(3, activation = 'softmax')(hidden1)


model = Model(inputs = main_input, outputs = output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     3000000     main_input[0][0]                 
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 100)          160400      embedding_1[0][0]                
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 9600)         48032       embedding_1[0][0]                
__________________________________________________________________________________________________
concatenat

In [11]:
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(padded_docs_train, y=ytrain_enc, batch_size=64, epochs=100, 
          verbose=1, validation_data=(padded_docs_valid, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<keras.callbacks.History at 0x7fb11095ef10>

## variation 2

In [93]:
## next try adding one more feature using dense layers and merge

from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

main_input = Input(shape=(300,), dtype='int32', name='main_input')

# embedding vectors
x = Embedding(vocab_size, 300, input_length=max_length)(main_input)
#model.add(Flatten())
x1 = flatten(x)

# lstm features
# lstm_encoding = LSTM(100, return_sequences=False)(x)
lstm_encoding = LSTM(100)(x)

# cnn features 
cnn_mod = Sequential()
cnn_mod.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu', input_shape=(300,300)))
cnn_mod.add(Flatten())
cnn_encoding = cnn_mod(x)

# dense features
dense_mod = Sequential()
dense_mod.add(TimeDistributed(Dense(100),input_shape=(300,300)))
dense_mod.add(Flatten())
dense_mod.add(Dropout(0.2))
dense_mod.add(BatchNormalization())

dense_mod.add(Activation('relu'))
#dense_mod.add(Dropout(0.3))
# dense_mod.add(BatchNormalization())


dense_encoding = dense_mod(x)
#dense_encoding1 = flatten(dense_encoding)

# combined features
merged = concatenate([lstm_encoding, cnn_encoding, dense_encoding])

#batch_normalized = BatchNormalization()(merged)



hidden1 = Dense(400, activation='relu')(merged)

#hidden2 = Dense(3)(hidden1)
output = Dense(3, activation = 'softmax')(hidden1)


model = Model(inputs = main_input, outputs = output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_74 (Embedding)        (None, 300, 300)     3000000     main_input[0][0]                 
__________________________________________________________________________________________________
lstm_71 (LSTM)                  (None, 100)          160400      embedding_74[0][0]               
__________________________________________________________________________________________________
sequential_134 (Sequential)     (None, 9600)         48032       embedding_74[0][0]               
__________________________________________________________________________________________________
sequential

In [94]:
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(padded_docs_train, y=ytrain_enc, batch_size=64, epochs=100, 
          verbose=1, validation_data=(padded_docs_valid, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<keras.callbacks.History at 0x7fb0ad711e50>

## variation 3

### glove vectors creation

In [100]:
# http://www-nlp.stanford.edu/data/glove.840B.300d.zip
# load the GloVe vectors in a dictionary:

embeddings_index = {}
#f = open('glove.840B.300d.txt')
f = open('glove.6B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

400000it [00:27, 14633.70it/s]

Found 400000 word vectors.





In [104]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [109]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

100%|██████████| 17621/17621 [00:06<00:00, 2542.54it/s]
100%|██████████| 1958/1958 [00:00<00:00, 2563.76it/s]


In [110]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [111]:
# scale the data before any neural net:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [114]:
# create a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_110 (Dense)            (None, 300)               90300     
_________________________________________________________________
dropout_58 (Dropout)         (None, 300)               0         
_________________________________________________________________
batch_normalization_77 (Batc (None, 300)               1200      
_________________________________________________________________
dense_111 (Dense)            (None, 300)               90300     
_________________________________________________________________
dropout_59 (Dropout)         (None, 300)               0         
_________________________________________________________________
batch_normalization_78 (Batc (None, 300)               1200      
_________________________________________________________________
dense_112 (Dense)            (None, 3)                 903       
__________

In [113]:
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(padded_docs_train, y=ytrain_enc, batch_size=64, epochs=100, 
          verbose=1, validation_data=(padded_docs_valid, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


<keras.callbacks.History at 0x7fb09ad27390>

## variation 4

In [127]:
## use two kinds of encoding the keras word embedding and glove
## use simple architecture for this

from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

main_input = Input(shape=(300,), dtype='int32', name='main_input')

# embedding vectors
x = Embedding(vocab_size, 300, input_length=max_length)(main_input)
x1= Flatten()(x)
#model.add(Flatten())
#x1 = flatten(x)

#glove-vectors
auxiliary_input = Input(shape=(300,), name='auxiliary_input')

#combine the inputs
comb_x = concatenate([x1, auxiliary_input])


#batch_normalized = BatchNormalization()(comb_x)

hidden1 = Dense(500, activation='relu')(comb_x)

#hidden2 = Dense(3)(hidden1)
output = Dense(3, activation = 'softmax')(hidden1)


model = Model(inputs =[main_input,auxiliary_input], outputs = output)


model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_85 (Embedding)        (None, 300, 300)     3000000     main_input[0][0]                 
__________________________________________________________________________________________________
flatten_76 (Flatten)            (None, 90000)        0           embedding_85[0][0]               
__________________________________________________________________________________________________
auxiliary_input (InputLayer)    (None, 300)          0                                            
__________________________________________________________________________________________________
concatenat

In [128]:
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit({'main_input':padded_docs_train, 'auxiliary_input':xtrain_glove_scl}, y=ytrain_enc, batch_size=64, epochs=100, 
          verbose=1, validation_data=([padded_docs_valid,xvalid_glove_scl], yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


<keras.callbacks.History at 0x7fb099748ed0>

## variation 5


In [163]:
## combine the above with cnn , lstm and dense layers

## use two kinds of encoding the keras word embedding and glove
## use complex architecture for this

from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

main_input = Input(shape=(300,), dtype='int32', name='main_input')

# embedding vectors
x = Embedding(vocab_size, 300, input_length=max_length)(main_input)
x1= Flatten()(x)
#glove-vectors
auxiliary_input = Input(shape=(300,), name='auxiliary_input')

#combine the inputs
comb_x = concatenate([x1, auxiliary_input])

In [166]:
# lstm features
lstm_encoding = LSTM(100,return_sequences = True)(comb_x)

# cnn features 
cnn_mod = Sequential()
cnn_mod.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu', input_shape=(300,300)))
cnn_mod.add(Flatten())
cnn_encoding = cnn_mod(comb_x)


# dense features
dense_mod = Sequential()
dense_mod.add(TimeDistributed(Dense(100),input_shape=(300,300)))
dense_mod.add(Flatten())
dense_mod.add(Dropout(0.2))
dense_mod.add(BatchNormalization())

dense_mod.add(Activation('relu'))
#dense_mod.add(Dropout(0.3))
# dense_mod.add(BatchNormalization())
dense_encoding = dense_mod(comb_x)


# combined features
merged = concatenate([lstm_encoding, dense_encoding])

#batch_normalized = BatchNormalization()(merged)

hidden1 = Dense(200, activation='relu')(merged)

#hidden2 = Dense(3)(hidden1)
output = Dense(3, activation = 'softmax')(hidden1)


model = Model(inputs =[main_input,aux_input], outputs = output)


model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

ValueError: Input 0 is incompatible with layer lstm_93: expected ndim=3, found ndim=2

In [None]:
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit({'main_input':padded_docs_train, 'auxiliary_input':xtrain_glove_scl}, y=ytrain_enc, batch_size=64, epochs=100, 
          verbose=1, validation_data=([padded_docs_valid,xvalid_glove_scl], yvalid_enc), callbacks=[earlystop])

# variation 6

In [95]:
# adding dense features as well
# we have already seen this is not going to improve our scores
