## Spooky author identification challenge

### This challenge invites Kagglers to identify the author of a horror story from a given text snippet. It differs a little from other NLP problems because we need to capture the signature of each author's writing style rather than simply understand the context vectors. Hence pre-trained word vectors may be of little help here. I use Keras embeddings instead of word2vec, combine features from an LSTM and a CNN (to be able to detect patterns under translation), and then pass the combined features to another hidden layer in the neural network.

In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import EarlyStopping
from keras import optimizers

In [None]:
# Load the training and test sets; paths are relative to the working directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Turn the author labels into integer class ids.
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit(train.author.values)
y = lbl_enc.transform(train.author.values)

In [None]:
# Hold out 10% of the snippets for validation, keeping the author
# proportions identical in both splits (stratified on y).
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.text.values,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [None]:
# one-hot encode the integer labels so they match the softmax output
from keras.utils import np_utils
ytrain_enc, yvalid_enc = (
    np_utils.to_categorical(labels) for labels in (ytrain, yvalid)
)

In [None]:
from keras.preprocessing.text import hashing_trick

# Size of the hashing space; word indices fall in [1, vocab_size - 1].
vocab_size = 10000

# one_hot() hashes words with Python's built-in hash(), which is salted per
# process for str objects, so the same word gets a different index in every
# session. That makes a saved model unusable later. hashing_trick() with
# 'md5' applies the same tokenisation but with a stable hash.
xtrain_enc = [hashing_trick(d, vocab_size, hash_function='md5') for d in xtrain]
xvalid_enc = [hashing_trick(d, vocab_size, hash_function='md5') for d in xvalid]

In [None]:
# Every document is padded (at the end) or truncated to this many tokens.
max_length = 300
padded_docs_train, padded_docs_valid = (
    pad_sequences(docs, maxlen=max_length, padding='post')
    for docs in (xtrain_enc, xvalid_enc)
)

In [None]:
## Create a functional api for shared features from cnn and lstm

#lstm with cnn
from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

# Width of each learned word vector.
embedding_dim = 300

# The input length is tied to max_length (the padding length) rather than a
# repeated literal, so the model cannot drift out of sync with the padded data.
main_input = Input(shape=(max_length,), dtype='int32', name='main_input')

# embedding vectors: (batch, max_length) -> (batch, max_length, embedding_dim)
x = Embedding(vocab_size, embedding_dim, input_length=max_length)(main_input)

# lstm features: last hidden state only -> (batch, 100)
lstm_encoding = LSTM(100)(x)

# cnn features: 'same' padding keeps the sequence length, so Flatten yields
# max_length * 32 features per document
cnn_mod = Sequential()
cnn_mod.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu',
                   input_shape=(max_length, embedding_dim)))
cnn_mod.add(Flatten())
cnn_encoding = cnn_mod(x)

# combined features
merged = concatenate([lstm_encoding, cnn_encoding])

hidden1 = Dense(500, activation='relu')(merged)

# three output classes, one per author
output = Dense(3, activation='softmax')(hidden1)

model = Model(inputs=main_input, outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

In [None]:
# Stop training once val_loss has not improved for 3 epochs in a row.
earlystop = EarlyStopping(monitor='val_loss', mode='auto', min_delta=0, patience=3, verbose=0)
model.fit(
    x=padded_docs_train,
    y=ytrain_enc,
    validation_data=(padded_docs_valid, yvalid_enc),
    epochs=100,
    batch_size=64,
    callbacks=[earlystop],
    verbose=1,
)

## variation 2

In [None]:
## next try adding one more feature using dense layers and merge

from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

# Width of each learned word vector.
embedding_dim = 300

main_input = Input(shape=(max_length,), dtype='int32', name='main_input')

# embedding vectors: (batch, max_length) -> (batch, max_length, embedding_dim)
x = Embedding(vocab_size, embedding_dim, input_length=max_length)(main_input)

# lstm features -> (batch, 100)
lstm_encoding = LSTM(100)(x)

# cnn features -> (batch, max_length * 32)
cnn_mod = Sequential()
cnn_mod.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu',
                   input_shape=(max_length, embedding_dim)))
cnn_mod.add(Flatten())
cnn_encoding = cnn_mod(x)

# dense features
# The branch is fed the 3-D embedding tensor, so it must be declared with a
# 3-D input shape (Dense then acts on the last axis of every time step) and
# must end in Flatten. Otherwise its output stays 3-D and cannot be
# concatenated with the 2-D LSTM/CNN encodings below.
# (GlobalMaxPooling1D would be a lighter alternative to Flatten.)
dense_mod = Sequential()
dense_mod.add(Dense(300, activation='relu', input_shape=(max_length, embedding_dim)))
dense_mod.add(Dropout(0.2))
dense_mod.add(BatchNormalization())

dense_mod.add(Dense(300, activation='relu'))
dense_mod.add(Dropout(0.3))
dense_mod.add(BatchNormalization())
dense_mod.add(Flatten())
dense_encoding = dense_mod(x)

# combined features (all three encodings are 2-D at this point)
merged = concatenate([lstm_encoding, cnn_encoding, dense_encoding])

hidden1 = Dense(400, activation='relu')(merged)

# three output classes, one per author
output = Dense(3, activation='softmax')(hidden1)

model = Model(inputs=main_input, outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None.
model.summary()

In [None]:
# Early stopping: give up after 3 epochs without a better val_loss.
earlystop = EarlyStopping(monitor='val_loss', mode='auto', min_delta=0, patience=3, verbose=0)
model.fit(
    x=padded_docs_train,
    y=ytrain_enc,
    validation_data=(padded_docs_valid, yvalid_enc),
    epochs=100,
    batch_size=64,
    callbacks=[earlystop],
    verbose=1,
)

## variation 3

### glove vectors creation

In [None]:
# load the GloVe vectors in a dictionary:
# NOTE(review): `tqdm` is used for the progress bar but is not imported in the
# visible cells; it must be in scope before this cell runs.

embeddings_index = {}
# `with` closes the file even if a line fails to parse; the encoding is set
# explicitly so the result does not depend on the platform default.
with open('glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Some tokens in glove.840B contain spaces, so a plain split() would
        # push part of the token into the coefficient list. Splitting from
        # the right keeps exactly the last 300 fields as coefficients.
        values = line.rstrip().rsplit(' ', 300)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s, embeddings=None, tokenizer=None, stopwords=None, dim=300):
    """Return the L2-normalised sum of the word vectors of sentence *s*.

    The sentence is lower-cased and tokenised; stop words, non-alphabetic
    tokens and tokens without a known vector are ignored.

    Args:
        s: The sentence (str, UTF-8 bytes, or anything ``str()`` accepts).
        embeddings: Mapping word -> vector. Defaults to the module-level
            ``embeddings_index``.
        tokenizer: Callable str -> list of tokens. Defaults to the
            module-level ``word_tokenize`` (not imported in the visible
            cells; it must be in scope).
        stopwords: Container of words to drop. Defaults to the module-level
            ``stop_words`` (likewise expected to be in scope).
        dim: Length of the zero vector returned when nothing can be embedded.

    Returns:
        A 1-D numpy array: the unit-length sum of the word vectors, or
        ``np.zeros(dim)`` when no token has a vector or the sum has zero norm.
    """
    # Resolve defaults lazily so the globals are only needed when not overridden.
    if embeddings is None:
        embeddings = embeddings_index
    if tokenizer is None:
        tokenizer = word_tokenize
    if stopwords is None:
        stopwords = stop_words

    # Python 3 str has no .decode(); only raw bytes need decoding.
    text = s.decode('utf-8') if isinstance(s, bytes) else str(s)

    vectors = [
        embeddings[w]
        for w in tokenizer(text.lower())
        if w not in stopwords and w.isalpha() and w in embeddings
    ]
    if not vectors:
        return np.zeros(dim)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    # Vectors that cancel out would otherwise produce NaNs via 0 / 0.
    if norm == 0:
        return np.zeros(dim)
    return v / norm

In [None]:
# embed every training and validation snippet as one sentence vector
xtrain_glove = list(map(sent2vec, tqdm(xtrain)))
xvalid_glove = list(map(sent2vec, tqdm(xvalid)))

In [None]:
# Stack the per-sentence vectors into 2-D arrays (one row per snippet).
xtrain_glove, xvalid_glove = (
    np.array(vectors) for vectors in (xtrain_glove, xvalid_glove)
)

In [None]:
# scale the data before any neural net:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [None]:
# simple feed-forward net: two regularised hidden layers and a softmax head
model = Sequential([
    Dense(300, input_dim=300, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(300, activation='relu'),
    Dropout(0.3),
    BatchNormalization(),
    Dense(3),
    Activation('softmax'),
])

# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Stop training once val_loss has not improved for 3 epochs in a row.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
# This dense net is meant to consume the scaled GloVe sentence vectors. The
# padded word-index matrices happen to have the same width (300), so feeding
# them raised no error but trained on the wrong features.
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, epochs=100,
          verbose=1, validation_data=(xvalid_glove_scl, yvalid_enc), callbacks=[earlystop])

## variation 4

In [None]:
## use two kinds of encoding the keras word embedding and glove
## use simple architecture for this

from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

# Width of each learned word vector.
embedding_dim = 300

main_input = Input(shape=(max_length,), dtype='int32', name='main_input')

# embedding vectors: (batch, max_length) -> (batch, max_length, embedding_dim)
x = Embedding(vocab_size, embedding_dim, input_length=max_length)(main_input)
# The GloVe sentence vector is 2-D (batch, 300), so the 3-D embedding output
# has to be flattened before the two can be concatenated.
x = Flatten()(x)

#glove-vectors
auxiliary_input = Input(shape=(300,), name='aux_input')

#combine the inputs
# Uses the `concatenate` imported above; the bare `keras` module name is
# never imported in this notebook.
comb_x = concatenate([x, auxiliary_input])

hidden1 = Dense(500, activation='relu')(comb_x)

# three output classes, one per author
output = Dense(3, activation='softmax')(hidden1)

# `aux_input` is only the layer's name; the tensor variable is auxiliary_input.
model = Model(inputs=[main_input, auxiliary_input], outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None.
model.summary()

In [None]:
# Stop training once val_loss has not improved for 3 epochs in a row.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
# Validation inputs follow the model's input order: [main_input, aux_input].
# The scaled validation GloVe features are named xvalid_glove_scl
# (`yvalid_glove_scl` does not exist).
model.fit({'main_input':padded_docs_train, 'aux_input':xtrain_glove_scl}, y=ytrain_enc, batch_size=64, epochs=100,
          verbose=1, validation_data=([padded_docs_valid, xvalid_glove_scl], yvalid_enc), callbacks=[earlystop])

## variation 5


In [None]:
## combine the above with cnn , lstm and dense layers

## use two kinds of encoding the keras word embedding and glove
## use complex architecture for this

from keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, concatenate
from keras.models import Model

# Width of each learned word vector and of the GloVe sentence vector.
embedding_dim = 300
glove_dim = 300

main_input = Input(shape=(max_length,), dtype='int32', name='main_input')

# embedding vectors: (batch, max_length) -> (batch, max_length, embedding_dim)
x = Embedding(vocab_size, embedding_dim, input_length=max_length)(main_input)

#glove-vectors
auxiliary_input = Input(shape=(glove_dim,), name='aux_input')

#combine the inputs
# The LSTM and CNN need a 3-D sequence, while the GloVe sentence vector is
# 2-D. Repeat it at every time step and join it to the word embeddings along
# the feature axis: (batch, max_length, embedding_dim + glove_dim).
aux_sequence = RepeatVector(max_length)(auxiliary_input)
comb_x = concatenate([x, aux_sequence])

# lstm features -> (batch, 100)
lstm_encoding = LSTM(100)(comb_x)

# cnn features; input_shape must match the widened feature axis of comb_x
cnn_mod = Sequential()
cnn_mod.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu',
                   input_shape=(max_length, embedding_dim + glove_dim)))
cnn_mod.add(Flatten())
cnn_encoding = cnn_mod(comb_x)

# combined features
merged = concatenate([lstm_encoding, cnn_encoding])

hidden1 = Dense(500, activation='relu')(merged)

# three output classes, one per author
output = Dense(3, activation='softmax')(hidden1)

# `aux_input` is only the layer's name; the tensor variable is auxiliary_input.
model = Model(inputs=[main_input, auxiliary_input], outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None.
model.summary()

In [None]:
# Stop training once val_loss has not improved for 3 epochs in a row.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
# Validation inputs follow the model's input order: [main_input, aux_input].
# The scaled validation GloVe features are named xvalid_glove_scl
# (`yvalid_glove_scl` does not exist).
model.fit({'main_input':padded_docs_train, 'aux_input':xtrain_glove_scl}, y=ytrain_enc, batch_size=64, epochs=100,
          verbose=1, validation_data=([padded_docs_valid, xvalid_glove_scl], yvalid_enc), callbacks=[earlystop])

## variation 6

In [None]:
# adding dense features as well

