In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [2]:
import spacy
nlp = spacy.load('en')

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

In [4]:
max_len = max([len(i.split()) for i in train.text.values])
max_len

861

In [3]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [6]:
lbl_encoder = preprocessing.LabelEncoder()
y = lbl_encoder.fit_transform(train.author.values)

In [7]:
x_train,x_valid,y_train,y_valid = train_test_split(train.text.values,y,
                                                       test_size=0.5,shuffle=True)

In [12]:
x_train_fltr = [[token.text for token in nlp(doc) if not token.is_stop 
                 and token.text not in ['.',',',':']]for doc in x_train]
x_valid_fltr = [[token.text for token in nlp(doc) if not token.is_stop
                 and token.text not in ['.',',',':']]for doc in x_valid]


In [None]:
x_train_fltr

In [36]:
x_train_fltr_join = [' '.join(doc) for doc in x_train_fltr]
x_valid_fltr_join = [' '.join(doc) for doc in x_valid_fltr]

In [37]:
x_train_fltr_join

['You careful thing downright improviso air',
 'In manner I fancied I distinguish precise moment fairly disengaged lid I determine removed altogether deposited lower berth room ; point I knew example certain slight taps lid striking wooden edges berth endeavored lay gently room floor',
 'So White Ship sailed past walls Thalarion followed days southward flying bird glossy plumage matched sky appeared',
 'God I dare tell I saw',
 'Oh beloved father Indeed miserable words truly I forgive entirely possess heart I endeavoured rainbow gleams cataract D soften thy tremendous sorrows',
 'At I beheld seething blur luminosity ; shapes infinitely distant began detach confusion I saw Juan Romero ?',
 'I wished shake thought feeling I learned means overcome sensation pain death state I feared understand',
 'To accomplish great desideratum ærial navigation generally supposed exceedingly complicated application unusually profound principle dynamics',
 'It Greenwich Greenwich roof row houses lovely gr

In [38]:
xtrain = np.array(x_train_fltr_join)
xvalid = np.array(x_valid_fltr_join)

In [39]:
xtrain.shape,xvalid.shape,y_train.shape,y_valid.shape

((9789,), (9790,), (9789,), (9790,))

In [8]:
from spacy.vectors import Vectors
import spacy

nlp =  spacy.load('en')
#nlp.vocab.vectors.from_glove('data/glove.840B.300d.txt')

In [None]:
xtrain_token=[]
for i in x_train:
    doc = nlp(i)
    xtrain_token.append([token.text for token in doc ])

In [None]:
xvalid_token=[]
for i in x_valid:
    doc = nlp(i)
    xvalid_token.append([token.text for token in doc ])


In [24]:
loc = "data/"
import bcolz
import pickle
import sys   
import pdb
import os
from tensorflow.contrib.learn.python import preprocessing

In [40]:
p = preprocessing.text.VocabularyProcessor(max_document_length=500)
xtrain_ids = np.array(list(p.fit_transform(x_train_fltr_join)))
xvalid_ids = np.array(list(p.fit_transform(x_valid_fltr_join)))

In [41]:
xtrain_ids.shape,xvalid_ids.shape

((9789, 500), (9790, 500))

In [43]:
xtrain_ids_v[np.where(xtrain_ids_v > 21000)].size

0

In [8]:
vocab_size = 21000

In [42]:
xtrain_ids_v = np.array([[i if i<vocab_size else vocab_size-1 for i in s] for s in xtrain_ids])
xvalid_ids_v = np.array([[i if i<vocab_size else vocab_size-1 for i in s] for s in xvalid_ids])

In [19]:
vocab_size = 10000
seq_len = 500

### Writing the input data to a pickle object

In [4]:
import pickle

In [None]:
file1 = open("predict_author_input.pkl","wb")

pickle.dump([xtrain_ids_v,xvalid_ids_v,ytrain_enc,yvalid_enc],file1)

In [5]:
xt,xv,yt,yv = pickle.load(open("predict_author_input.pkl","rb"))

## Embedding matrix

In [None]:
from spacy.vectors import  Vectors


## CNN/LSTM Network for classifying the Author

In [45]:

# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(y_train)
yvalid_enc = np_utils.to_categorical(y_valid)

In [18]:
from sklearn import preprocessing as skp
scl = skp.StandardScaler()
xtrain_ids_scl = scl.fit_transform(xtrain_ids_v)
xvalid_ids_scl = scl.transform(xvalid_ids_v)



In [6]:
from keras.layers import  Conv1D

In [61]:
model = Sequential([
    Embedding(vocab_size, 100, input_length=seq_len),
    Dropout(0.25),
    Conv1D(padding="same", kernel_size=3, filters=64, activation="relu"),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    #Dropout(0.2),
    Dense(3, activation='softmax')])

In [62]:
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

In [63]:
model.optimizer.lr=0.1
model.fit(xt,yt,batch_size=128,epochs=5,
          validation_data=(xv,yv))
model.save_weights('model1_weights.h5')

Train on 9789 samples, validate on 9790 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 

In [37]:
model_bn = Sequential([
    Embedding(vocab_size, 300, input_length=seq_len,  
             trainable=True),
    BatchNormalization(),
    Dropout(0.25),
    Conv1D(padding="same", kernel_size=5, filters=64, activation="relu"),
    BatchNormalization(),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(3, activation='softmax')])
model_bn.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [38]:
model_bn.fit(xtrain_ids_v,ytrain_enc,batch_size=256,epochs=10,
          validation_data=(xvalid_ids_v,yvalid_enc))

Train on 9789 samples, validate on 9790 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ffafe77d438>

In [16]:
model_l = Sequential()
model_l.add(Embedding(vocab_size,
                     300,
                     input_length=seq_len,
                     trainable=True))
model_l.add(SpatialDropout1D(0.3))
model_l.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model_l.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model_l.add(Dense(1024, activation='relu'))
model_l.add(Dropout(0.8))

model_l.add(Dense(1024, activation='relu'))
model_l.add(Dropout(0.8))

model_l.add(Dense(3))
model_l.add(Activation('softmax'))
model_l.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model_l.fit(xtrain_ids_v, y=ytrain_enc, batch_size=512, epochs=10, 
          verbose=1, validation_data=(xvalid_ids_v, yvalid_enc), callbacks=[earlystop])

Train on 9789 samples, validate on 9790 samples
Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[512,500,300]
	 [[Node: gru_4/zeros_like = ZerosLike[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](gru_3/transpose_1)]]
	 [[Node: loss_1/mul/_179 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3975_loss_1/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'gru_4/zeros_like', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/surya/DL/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/surya/DL/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/surya/DL/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/surya/DL/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/surya/DL/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/surya/DL/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/surya/DL/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/surya/DL/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/surya/DL/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/surya/DL/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/surya/DL/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/surya/DL/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-c72c557640a6>", line 8, in <module>
    model_l.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))
  File "/home/surya/DL/lib/python3.5/site-packages/keras/models.py", line 475, in add
    output_tensor = layer(self.outputs[0])
  File "/home/surya/DL/lib/python3.5/site-packages/keras/layers/recurrent.py", line 483, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "/home/surya/DL/lib/python3.5/site-packages/keras/engine/topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "/home/surya/DL/lib/python3.5/site-packages/keras/layers/recurrent.py", line 1508, in call
    initial_state=initial_state)
  File "/home/surya/DL/lib/python3.5/site-packages/keras/layers/recurrent.py", line 541, in call
    initial_state = self.get_initial_state(inputs)
  File "/home/surya/DL/lib/python3.5/site-packages/keras/layers/recurrent.py", line 469, in get_initial_state
    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
  File "/home/surya/DL/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 753, in zeros_like
    return tf.zeros_like(x, dtype=dtype, name=name)
  File "/home/surya/DL/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1495, in zeros_like
    return gen_array_ops._zeros_like(tensor, name=name)
  File "/home/surya/DL/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 5960, in _zeros_like
    "ZerosLike", x=x, name=name)
  File "/home/surya/DL/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/surya/DL/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/surya/DL/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[512,500,300]
	 [[Node: gru_4/zeros_like = ZerosLike[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](gru_3/transpose_1)]]
	 [[Node: loss_1/mul/_179 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3975_loss_1/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [50]:
# A simple bidirectional LSTM with glove embeddings and two dense layers
model2 = Sequential()
model2.add(Embedding(vocab_size,
                     64,
                     input_length=seq_len,
                     trainable=True))
model2.add(SpatialDropout1D(0.3))
model2.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))

model2.add(Dense(1024, activation='relu'))
model2.add(Dropout(0.8))

model2.add(Dense(1024, activation='relu'))
model2.add(Dropout(0.8))

model2.add(Dense(3))
model2.add(Activation('softmax'))
model2.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model2.fit(xt, y=yt, batch_size=512, epochs=10, 
          verbose=1, validation_data=(xv, yv), callbacks=[earlystop])

Train on 9789 samples, validate on 9790 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbcb2575ac8>

In [51]:
model2.optimizer.lr=0.1
model2.fit(xt, y=yt, batch_size=512, epochs=3, 
          verbose=1, validation_data=(xv, yv), callbacks=[earlystop])

Train on 9789 samples, validate on 9790 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbcb221bd30>