In [7]:
%%time 
from __future__ import division, print_function, absolute_import
import numpy as np
import tensorflow as tf
import tflearn
import wordninja # an elegant way for words extraction
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
import keras.preprocessing.text
from sklearn.metrics import accuracy_score #accuracy calculator 
from sklearn.model_selection import train_test_split
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression




CPU times: user 53 µs, sys: 1e+03 ns, total: 54 µs
Wall time: 57.2 µs


## **The Task**

Train a machine learning model that classifies a given line as belonging to one of the following 12 novels:

- alice_in_wonderland
- dracula
- dubliners
- great_expectations

And more... 

Basically, a classification NLP problem.

### Load files
The files can also be loaded with pandas or NumPy.

In [8]:
# Read the raw samples and their labels, one entry per line. The context
# managers close each file handle as soon as its contents have been read.
with open("xtrain.txt") as x_file:
    X = x_file.read().splitlines()
with open("ytrain.txt") as y_file:
    y = y_file.read().splitlines()


In [9]:
# Display the number of samples and labels so that a mismatch is visible.
len(X), len(y)

(32732, 32732)

- First, **extract validation data** in order to check how the model predicts on unseen data.

In [10]:
# Hold out the tail of the data as a validation set.
VALID_START = 31000  # index of the first held-out line
# X and y are truncated below, so running this cell a second time would
# silently produce an empty validation set; fail loudly instead.
assert len(X) > VALID_START, "X is already truncated - reload the data first"
x_valid, X = X[VALID_START:], X[:VALID_START]
y_valid, y = y[VALID_START:], y[:VALID_START]
len(x_valid), len(X), len(y_valid), len(y)

(1732, 31000, 1732, 31000)

Split list into random train and test subsets

In [11]:

# Randomly split the remaining samples: 20% go to the test part, the rest to
# the training part. The fixed random_state seeds the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)


In [12]:
# Display the sizes of the four pieces produced by the split.
len(X_train),len(X_test),len(y_train),len(y_test)

(24800, 6200, 24800, 6200)

In [13]:
# Labels are read from disk as strings; cast each one to an integer class id.
y_train = [int(label) for label in y_train]
y_test = [int(label) for label in y_test]
y_valid = [int(label) for label in y_valid]

## Word extraction 
Use wordninja to extract words from the unspaced strings in the data.

In [15]:
%%time
xtrain_ext = []
for line in X_train:
    temp_list = wordninja.split(line)
    #removing "noisy" words
    temp_list = [x for x in temp_list if len(x) > 2]
    xtrain_ext.append(temp_list)
    
xtest_ext = []
for line in X_test:
    temp_list = wordninja.split(line)
    #removing "noisy" words 
    temp_list = [x for x in temp_list if len(x) > 2]
    xtest_ext.append(temp_list)

xvalid_ext = []
for line in x_valid:
    temp_list = wordninja.split(line)
    #removing "noisy" words
    temp_list = [x for x in temp_list if len(x) > 2]
    xvalid_ext.append(temp_list)

CPU times: user 1min 43s, sys: 805 ms, total: 1min 44s
Wall time: 1min 50s


In [23]:
# Show one training line before and after word extraction.
"before", X_train[1111], "after", xtrain_ext[1111]

('before',
 'andfortwenty-fiveyearsshehadkepthouseshrewdlyforherhusband.hertwoeldestsonswerelaunched.onewasinadraper’sshopinglasgowandtheotherwasclerktoatea-merchantinbelfast.theywere',
 'after',
 ['and',
  'for',
  'twenty',
  'five',
  'years',
  'she',
  'had',
  'kept',
  'house',
  'shrewdly',
  'for',
  'her',
  'husband',
  'her',
  'two',
  'eldest',
  'sons',
  'were',
  'launched',
  'one',
  'was',
  'draper',
  'shop',
  'glasgow',
  'and',
  'the',
  'other',
  'was',
  'clerk',
  'tea',
  'merchant',
  'belfast',
  'they',
  'were'])

## Text Preprocessing 

In [135]:
# Fit the vocabulary on the training split only, then encode all three splits
# with it.
# NOTE(review): char_level=True is passed although the inputs are lists of
# words - presumably so that the list items are indexed as-is; confirm against
# the installed Keras version.
tk = Tokenizer(lower=True, char_level=True, num_words=None)
tk.fit_on_texts(xtrain_ext)
x_train, x_test, x_valid = (
    tk.texts_to_sequences(split) for split in (xtrain_ext, xtest_ext, xvalid_ext)
)


# Models

I should explain why I've chosen the TFLearn package: the main reason is that I **like** it; second, TFLearn is more closely tied to TensorFlow than, e.g., Keras or MXNet. It also provides a top-level abstraction that is simple, fairly robust, and understandable. Importantly, using a high-level abstraction is less error-prone than working at the low level (pure TensorFlow). 


The one thing that is not yet implemented in TFLearn is a learning rate scheduler; I'm even thinking of contributing one to the package because it is a must-have feature for the Adam optimizer. 


Anyway, everything below can easily be implemented in Keras or MXNet. 

### The idea 

For the final solution, I've decided to use two models and then average the predictions of both. Most likely the result would be even better when combining more models.   

## Data preparation 



In [194]:
# Data preprocessing
# Fixed sequence length in tokens. The networks are trained on 40-token
# inputs, so the length is pinned here instead of being derived from whichever
# split happens to contain the longest line.
maxlen = 40
# Sequence padding (0 is the padding value)
trainX = pad_sequences(x_train, maxlen=maxlen, value=0.)
testX = pad_sequences(x_test, maxlen=maxlen, value=0.)
# Converting labels to binary vectors, one column per class
trainY = to_categorical(y_train, nb_classes=12)
testY = to_categorical(y_test, nb_classes=12)

### 1st Model

For the first model, I've used bidirectional RNN with LSTM cells. 

In [205]:
tf.reset_default_graph() # start from an empty graph before building a model

# Size the network from the prepared data instead of repeating magic numbers.
seq_len = trainX.shape[1]             # tokens per padded sample
vocab_size = len(tk.word_index) + 1   # +1 because id 0 is the padding value

net = input_data(shape=[None, seq_len])
net = embedding(net, input_dim=vocab_size, output_dim=256)
net = bidirectional_rnn(net, BasicLSTMCell(256), BasicLSTMCell(256))
net = dropout(net, 0.5) # regularization against overfitting
net = fully_connected(net, 12, activation='softmax') # one output per class
net = regression(net, learning_rate=0.001, optimizer='adam', loss='categorical_crossentropy')

# Training; the test split serves as the validation set during fitting
model_lstm = tflearn.DNN(net)

model_lstm.fit(
    trainX,
    trainY,
    validation_set=(testX, testY),
    show_metric=True,
    batch_size=128,
    n_epoch=5,
)

Training Step: 874  | total loss: [1m[32m0.24668[0m[0m | time: 154.996s
| Adam | epoch: 005 | loss: 0.24668 - acc: 0.9362 -- iter: 22272/22320
Training Step: 875  | total loss: [1m[32m0.23942[0m[0m | time: 160.379s
| Adam | epoch: 005 | loss: 0.23942 - acc: 0.9371 | val_loss: 1.11400 - val_acc: 0.7085 -- iter: 22320/22320
--


### 2nd Model

For the second model I've decided to use a bunch of convolution networks. 

In [208]:
tf.reset_default_graph()

# Size the network from the prepared data instead of repeating magic numbers.
seq_len = trainX.shape[1]             # tokens per padded sample
vocab_size = len(tk.word_index) + 1   # +1 because id 0 is the padding value

network = input_data(shape=[None, seq_len], name='input')
# Building convolution network
network = embedding(network, input_dim=vocab_size, output_dim=200)
# Four parallel 1-D convolutions that differ only in their window size
branches = [
    conv_1d(network, 200, width, padding='valid', activation='relu', regularizer="L2")
    for width in (3, 4, 5, 6)
]
network = merge(branches, mode='concat', axis=1)
network = tf.expand_dims(network, 2)  # insert an axis at index 2 before pooling
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 12, activation='softmax')  # one output per class
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
# Training; the test split serves as the validation set during fitting
model = tflearn.DNN(network)
model.fit(trainX, trainY, n_epoch=7, shuffle=True,
          validation_set=(testX, testY), show_metric=True, batch_size=32)

Training Step: 5424  | total loss: [1m[32m0.08348[0m[0m | time: 288.334s
| Adam | epoch: 007 | loss: 0.08348 - acc: 0.9801 -- iter: 24768/24800
Training Step: 5425  | total loss: [1m[32m0.08006[0m[0m | time: 297.749s
| Adam | epoch: 007 | loss: 0.08006 - acc: 0.9821 | val_loss: 0.97496 - val_acc: 0.7524 -- iter: 24800/24800
--


In [None]:
# Pad the encoded hold-out sequences to the same length as the training data.
# x_valid already holds the tokenizer output for the hold-out lines, so it is
# reused here; overwriting xvalid_ext with its own encoding would make this
# cell non-idempotent.
xvalid_ext = pad_sequences(x_valid, maxlen=maxlen, value=0.)
xvalid_ext[22]

 - Verify the LSTM model:

In [206]:
# Predicted class per hold-out sample: index of the largest LSTM model output.
x_valid_pred = np.argmax(model_lstm.predict(xvalid_ext),axis=1)

In [207]:
# Hold-out accuracy; arguments follow sklearn's (y_true, y_pred) order.
accuracy_score(y_valid, x_valid_pred)

0.7176674364896074

 - Verify the Convolution model:

In [209]:
# Predicted class per hold-out sample: index of the largest convolution model output.
x_valid_pred = np.argmax(model.predict(xvalid_ext),axis=1)

In [210]:
# Hold-out accuracy; arguments follow sklearn's (y_true, y_pred) order.
accuracy_score(y_valid, x_valid_pred)

0.75

### Stacking 

In [211]:
# Average the outputs of both models on the hold-out set, element by element.
lstm_scores = np.array(model_lstm.predict(xvalid_ext))
cnn_scores = np.array(model.predict(xvalid_ext))
stck = (lstm_scores + cnn_scores) / 2

In [215]:
# Hold-out accuracy of the averaged predictions; arguments follow sklearn's
# (y_true, y_pred) order.
accuracy_score(y_valid, np.argmax(stck, axis=1))

0.77598152424942268

## Final predictions

In [226]:
# Encode the unlabeled submission file the same way as the training data:
# split into words, drop tokens of one or two characters, map to ids, pad.
# Dedicated names are used so the hold-out split (X_test / testX) is not
# overwritten by the submission data.
with open("xtest.txt") as submit_file:
    X_submit = submit_file.read().splitlines()
submit_words = [
    [word for word in wordninja.split(line) if len(word) > 2]
    for line in X_submit
]
submitX = pad_sequences(tk.texts_to_sequences(submit_words), maxlen=maxlen, value=0.)
# Average the outputs of both models, element by element.
stck = (np.array(model_lstm.predict(submitX)) + np.array(model.predict(submitX)))/2

In [228]:
# Final class id per submission line: index of the largest averaged output.
submit = np.argmax(stck,axis=1)

In [242]:
# Persist the predicted class ids, one integer per line.
with open('submit.txt', 'w') as thefile:
    thefile.writelines("%i\n" % item for item in submit)

# Conclusion:

The score on the validation set (unseen training data) is about 0.78; it can be improved in the following ways:
- use chars for tokenization;
- use more (bigger) models;
- avoid overfitting;
- try SVM.