In [1]:
from __future__ import print_function

import os
import sys
import numpy as np

import pandas as pd

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Lambda
from keras.layers import Embedding
from keras.models import Model, Sequential
import tensorflow as tf

Using TensorFlow backend.


In [None]:
pd.read_csv()

In [3]:
data = pd.read_csv("train.csv")

In [4]:
data.shape

(45824, 3)

In [5]:
data.head()

Unnamed: 0,fileid,converse,categories
0,10555,with doctor hull,APPOINTMENTS
1,27403,m for parent to cb to schedule rov with doctor...,APPOINTMENTS
2,30000,to schedule ctt please patients mp from other ...,APPOINTMENTS
3,41264,Rx refill request patient prescription refill ...,PRESCRIPTION
4,7785,headaches work phone name other mom advises ch...,ASK_A_DOCTOR


In [6]:
data.dtypes

fileid         int64
converse      object
categories    object
dtype: object

In [7]:
data.drop('fileid', axis=1, inplace=True)
# Remove fileid column as this is not required
#fileid = data.pop('fileid')

In [8]:
data.head()

Unnamed: 0,converse,categories
0,with doctor hull,APPOINTMENTS
1,m for parent to cb to schedule rov with doctor...,APPOINTMENTS
2,to schedule ctt please patients mp from other ...,APPOINTMENTS
3,Rx refill request patient prescription refill ...,PRESCRIPTION
4,headaches work phone name other mom advises ch...,ASK_A_DOCTOR


In [94]:
len(data.categories.unique())

6

In [9]:
# Let us fix up the target as categories to start with
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [10]:
# Converting the categories to label
data.categories = le.fit_transform(data.categories)

In [11]:
data.head()

Unnamed: 0,converse,categories
0,with doctor hull,0
1,m for parent to cb to schedule rov with doctor...,0
2,to schedule ctt please patients mp from other ...,0
3,Rx refill request patient prescription refill ...,5
4,headaches work phone name other mom advises ch...,1


In [12]:
labels = data.pop('categories')
X = data.as_matrix()

In [13]:
X.shape

(45824, 1)

In [14]:
MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 39288
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2

In [15]:
X = np.array(X).flatten()

In [16]:
X=X.astype(str)

In [17]:
X.shape

(45824,)

In [18]:
X[2]

'to schedule ctt please patients mp from other clinic name wpp reason for call details to schedule ctt please patients mp rna follow scheduled'

In [19]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
#tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [20]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


Found 34730 unique tokens.


In [24]:
labels.unique()

array([0, 5, 1, 4, 3, 2], dtype=int64)

In [25]:
# making the train and validation datasets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)

In [26]:
indices

array([42189, 31800, 16796, ..., 44290, 20536, 14227])

In [27]:
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [28]:
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples] # This is for categories as target

In [29]:
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:] # This is for categories as target

In [30]:
x_train.shape[1]

100

In [31]:
type(y_train)

pandas.core.series.Series

In [32]:
from  keras.utils import to_categorical

In [33]:
y_train = to_categorical(y_train, num_classes=6)

In [34]:
y_val = to_categorical(y_val, num_classes=6)

In [35]:
type(y_train)

numpy.ndarray

In [36]:
y_train

array([[ 0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.]])

In [42]:
# The above model is clearly overfitting. So lets add some dropout and do batch norm
from keras.layers.normalization import BatchNormalization
from keras.layers import Activation

In [163]:
def baseline_model():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= data.shape[1] ))
    model.add(Flatten())
    model.add(Dropout(0.2))

    # hidden Layer 1
    model.add(Dense(100))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    # hidden Layer 2
    model.add(Dense(50))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    model.add(Dense(6, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
    return model

In [164]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')


In [166]:
model = baseline_model()

<function __main__.baseline_model>

In [152]:
model.fit(data, y_train,
          batch_size=64,
          epochs=2,
          
          callbacks=[reduce_lr, early_stop])

Train on 36660 samples, validate on 9164 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1930deda7f0>

In [153]:
y_val2 = model.predict(x_val)

In [155]:
y_val2.shape

(9164, 6)

In [157]:
from keras.wrappers.scikit_learn import KerasClassifier

In [167]:
estimator = KerasClassifier(build_fn=baseline_model_nn)

Train on 36660 samples, validate on 9164 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1930fd17b38>

In [None]:
estimator.fit(data, y_train1,
          batch_size=64,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

In [169]:
predictions = estimator.predict(x_val)
print(predictions)
print(le.inverse_transform(predictions))

[5 1 5 ..., 1 0 1]
['PRESCRIPTION' 'ASK_A_DOCTOR' 'PRESCRIPTION' ..., 'ASK_A_DOCTOR'
 'APPOINTMENTS' 'ASK_A_DOCTOR']


## Using LSTM instead of MLP
### Hence forward we will be using plain embeddings instead of glove word2vec as it has not given great performance ( Suspect words like Rx etc., are being omitted from glove vocabulary)

In [170]:
from keras.layers.recurrent import LSTM

In [171]:
def baseline_model():
    model = Sequential()
    # num_words is the number of unique words
    # Embedding dimension is the dimension of the hidden layer that we choose --> embedding vector ( we can choose this to be 50, or 100 or so)
    # input length is the fixed length of the sentence that we feed to this embedding layer network ( https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work)
    # Here we mentioned trainable = false, because we are directly using the weights from embedding matrix from glove. 
    #model.add(embedding_layer)
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= data.shape[1] ))
    #model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
    #model.add(Flatten())
    model.add(LSTM(256))
    model.add(Dense(6, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    return model

In [172]:
model = baseline_model()

In [173]:
from keras import callbacks

In [174]:
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')

In [175]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=1,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 36660 samples, validate on 9164 samples
Epoch 1/1


<keras.callbacks.History at 0x19317be0dd8>

In [49]:
from keras.wrappers.scikit_learn import KerasClassifier

In [179]:
estimator = KerasClassifier(build_fn=baseline_model)

In [180]:
estimator.fit(x_train, y_train1,
          batch_size=64,
          epochs=2,
          verbose=1,
          validation_data=(x_val, y_val1),
          callbacks=[reduce_lr, early_stop])

Train on 36660 samples, validate on 9164 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1931c539358>

In [181]:
predictions = estimator.predict(x_val)
print(predictions)
print(le.inverse_transform(predictions))

[5 4 5 ..., 5 0 1]
['PRESCRIPTION' 'MISCELLANEOUS' 'PRESCRIPTION' ..., 'PRESCRIPTION'
 'APPOINTMENTS' 'ASK_A_DOCTOR']


### Using a Convolution net

In [37]:
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization

filters = 256
kernel_size = 3
hidden_dims = 100

In [38]:
from keras.optimizers import Adam

In [46]:
custom_adam = Adam(lr = 0.0001, decay = 0.00001)

In [47]:
def base_line_model():
    
    model = Sequential()

    # model.add(embedding_layer) --> This uses Glove
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= x_train.shape[1] )) # --> This is plain embeddings

    #model.add(Flatten()) # Not reqiured here (its giving an error if used)

    model.add(Dropout(0.2))

    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     #activation='relu',
                     strides=1))
    # we use max pooling:
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))


    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(6, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',
              optimizer=custom_adam,
              metrics=['acc'])
    return model

In [43]:
model = base_line_model()

In [44]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')


In [45]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=3,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 36660 samples, validate on 9164 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1beaaf2cf98>

In [198]:
model.predict(x_val)

array([[  3.66134266e-03,   3.92133445e-02,   4.75013202e-08,
          1.46609801e-03,   2.51798369e-02,   9.30479348e-01],
       [  1.13498703e-01,   1.29671365e-01,   1.01358026e-04,
          1.96862090e-02,   7.26207614e-01,   1.08347209e-02],
       [  2.10267818e-03,   7.72299349e-01,   2.08049800e-10,
          1.11063931e-03,   6.72703655e-03,   2.17760295e-01],
       ..., 
       [  1.81089400e-03,   2.94739246e-01,   2.41806769e-10,
          9.70876252e-04,   4.01671510e-03,   6.98462248e-01],
       [  2.67218739e-01,   5.04393578e-01,   1.36649460e-05,
          1.40445139e-02,   1.69805482e-01,   4.45241034e-02],
       [  8.02200194e-03,   6.59326553e-01,   6.50577263e-08,
          7.10078282e-03,   3.06657422e-02,   2.94884801e-01]], dtype=float32)

In [None]:
y_val

In [51]:
estimator = KerasClassifier(build_fn=base_line_model)

In [74]:
estimator.fit(x_train, y_train,
          batch_size=64,
          epochs=10,
          verbose=1,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 36660 samples, validate on 9164 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bebdf70c18>

In [75]:
predictions = estimator.predict(x_val)
print(predictions)
print(le.inverse_transform(predictions))

[3 1 5 ..., 5 5 5]
['LAB' 'ASK_A_DOCTOR' 'PRESCRIPTION' ..., 'PRESCRIPTION' 'PRESCRIPTION'
 'PRESCRIPTION']


# predicting the test data using conv network

In [54]:
test_data = pd.read_csv("test.csv")

In [55]:
test_data.shape

(11456, 2)

In [56]:
test_data.head()

Unnamed: 0,fileid,converse
0,18766,medfusion secure electronic message subject ra...
1,25550,left msg on home to schedule yrly pm rhonda pe...
2,43519,speak w express scripts re medicines issues pr...
3,50915,gilenya Rx from pharmacy name reason for call ...
4,25917,Rx decadron patient appointments patients requ...


In [57]:
fileId = test_data.pop('fileid')

In [58]:
test_X = test_data.as_matrix()

In [59]:
test_X.shape

(11456, 1)

In [60]:
test_X = np.array(test_X).flatten()

In [61]:
test_X=test_X.astype(str)

In [62]:
#tokenizer.fit_on_texts(test_X)
test_sequences = tokenizer.texts_to_sequences(test_X)

In [63]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

tes_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 34730 unique tokens.


In [76]:
predictions = estimator.predict(tes_data)
print(predictions)
print(le.inverse_transform(predictions))
predictions_le = le.inverse_transform(predictions)

[4 4 5 ..., 4 0 1]
['MISCELLANEOUS' 'MISCELLANEOUS' 'PRESCRIPTION' ..., 'MISCELLANEOUS'
 'APPOINTMENTS' 'ASK_A_DOCTOR']


In [77]:
pred_final = pd.DataFrame(np.column_stack((fileId,predictions_le)),columns=['fileid','categories'])

In [78]:
pred_final.describe()

Unnamed: 0,fileid,categories
count,11456,11456
unique,11456,5
top,49150,PRESCRIPTION
freq,1,3348


In [79]:
pred_final.to_csv("submissions_3.csv")