### Data Load

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Load the training set from a pickle in the working directory; later cells
# read its `Content` (text) and `Lable` (class) columns.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train = pd.read_pickle('./train.pkl')

In [4]:
# Load the held-out test set from a pickle in the working directory; it is
# accessed through the same `Content` and `Lable` columns as `train`.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
test = pd.read_pickle('./test.pkl')

In [5]:
# TF-IDF featurizer; min_df=3 drops terms that occur in fewer than 3 documents.
vectorizer  = TfidfVectorizer(min_df=3)

In [6]:
# Learn the vocabulary/IDF weights from the training text and vectorize it.
x_train = vectorizer.fit_transform(train['Content'])

In [7]:
# Vectorize the test text with the vocabulary fitted on the training text
# (transform only, so nothing is learned from the test split).
x_test = vectorizer.transform(test['Content'])

In [8]:
# One-hot encode the training labels; one column per distinct label value.
y_train = pd.get_dummies(train['Lable']).values

In [9]:
# One-hot encode the test labels against the *training* label set so that
# y_test always has the same columns, in the same order, as y_train - even
# when a class is missing from the test split. A test label that never
# appears in train becomes an all-zero row instead of adding a column.
y_test = pd.get_dummies(
    pd.Categorical(test.Lable, categories=pd.get_dummies(train.Lable).columns)
).values

### Feed Forward Network

I use an sklearn-style wrapper around a feed-forward neural network. The wrapper makes it quick to modify hyper-parameters. It is built on Keras with the TensorFlow backend.

In [10]:
from neural_network import FeedForwardNetwork

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Configure the network for the validation run: 10% of the training data is
# held out for validation and early stopping is enabled.
model = FeedForwardNetwork(
    batch_size=256,
    hidden_layers=[100],
    verbosity=2,
    epochs=10,
    validation_split=0.1,
    dropout=0.6,
    early_stopping=True,
)

In [12]:
# Train on the TF-IDF features. In the recorded run this used 45000 training
# and 5000 validation samples, and the log ends after epoch 6 of 10.
model.fit(X=x_train, y=y_train)

Data size (50000, 123846) -	 Epochs 10 -	 Batch Size 256
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Fit complete in 261.82 seconds


### Testing 

Test the model once it is finalized.

In [13]:
# Score the wrapped model on the held-out test split.
# NOTE(review): the two returned values are presumably [loss, accuracy]; this
# depends on the metrics the wrapper compiles the model with - confirm.
model.model.evaluate(x=x_test, y=y_test)



[0.4355159841846615, 0.8644706653946933]

### Retrain and Pack Model

Retrain on all data (train + test) and serialize the vectorizer and the model for use in the application.

In [14]:
import pickle as pk 

In [15]:
# Stack the train and test rows into one frame for the final fit.
# The original index labels are kept (no ignore_index).
data = pd.concat([train, test])

In [16]:
# Fresh TF-IDF featurizer (same min_df=3 setting as before), to be refit on
# the combined data. This rebinds `vectorizer`, replacing the train-only one.
vectorizer  = TfidfVectorizer(min_df=3)

In [17]:
# Fit the new vectorizer on the text of the combined data and vectorize it.
x = vectorizer.fit_transform(data['Content'])

In [18]:
# One-hot encode the labels of the combined data.
y = pd.get_dummies(data['Lable']).values

In [19]:
# Serialize the fitted vectorizer for the application. The with-block
# guarantees the file is flushed and closed, so the pickle on disk is complete
# as soon as this cell finishes (the bare open() handle was never closed).
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pk.dump(vectorizer, vectorizer_file)

In [20]:
# Final model: same architecture as the validation run, but with no validation
# split and no early stopping; the epoch count is fixed based on where the
# early-stopped run ended.
model = FeedForwardNetwork(
    batch_size=256,
    hidden_layers=[100],
    verbosity=2,
    epochs=5,
    dropout=0.6,
)

In [21]:
# Fit the final model on the combined data (62204 x 145845 in the recorded run).
model.fit(X=x, y=y)

Data size (62204, 145845) -	 Epochs 5 -	 Batch Size 256
Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fit complete in 320.43 seconds


In [22]:
# Save the underlying model (the wrapper's `.model` attribute) to ffn.h5 so the
# application can load it alongside vectorizer.pkl.
model.model.save('ffn.h5')

### Test Case

In [23]:
# Pick one document from the test frame (index label 50000) as a smoke-test input.
words = test['Content'][50000]

In [24]:
# Class names in the column order of the one-hot targets the final model was
# trained on. Those targets were built from `data.Lable` (train + test), so
# the names must come from the same source; using `train.Lable` would map
# argmax indices to the wrong names if the label sets ever differ.
labels = list(pd.get_dummies(data.Lable).columns)

In [25]:
# Display the list of class names.
labels

['APPLICATION',
 'BILL',
 'BILL BINDER',
 'BINDER',
 'CANCELLATION NOTICE',
 'CHANGE ENDORSEMENT',
 'DECLARATION',
 'DELETION OF INTEREST',
 'EXPIRATION NOTICE',
 'INTENT TO CANCEL NOTICE',
 'NON-RENEWAL NOTICE',
 'POLICY CHANGE',
 'REINSTATEMENT NOTICE',
 'RETURNED CHECK']

In [26]:
# Show the raw text of the document chosen for the test case
# (the value already stored in `words`).
words

'b53709df4565 41236ef86234 9f43707507d4 0b5273ff6b8d c9b917564931 b2e477f34f2e 5d14dbafa202 6b304aabdcee 1c0513b41e39 8442c7b20ebd fffcd2784f1f 46c88d9303da 26f768da5068 e4549cb26d13 dc32bc450322 5b86a72d0b5c bb00f25a2371 20d53168dbb6 2ecd83eb765f 0b5273ff6b8d c04bc38995f6 5e0ada30950e ed5376972206 aea809460491 26f768da5068 019aef1ca4e2 3d19c156da79 6af770640118 496f0ae3495a c85b3821556b 8397c99c9ce8 4d87510b8078 41b8bc8aa308 2ecd83eb765f 669b3d0100b0 ca89ece6fa29 a9d16358a5a5 b3bfe684f69b db5841f3564b cbfbf8a6dae8 23922552a7b2 1ecc22fada6d 6dde04aab38a 349138fc9d1f 23ecccc83abc 1ab34730c1e0 54709b24b45f 1615e491eaf2 be0d06caf707 1ee4d6725a3c cb70131b7955 b7a0f56f6ce8 bf064c332aa1 2da39f28c11c 48e0478ade50 1015893e384a 0699bac77427 0424f5084ae2 ca80e077feb2 bad6ff5dd7bc 2f39b67a7ff6 1031afa38dba e285504b15ab 58fd01b6676f 6dd785a78dc4 cb16de709496 d3196f699a9d e1ddb3dc3164 c528b5658528 898e63c07798 91b35abb2f4e 54709b24b45f d4d17158df11 7b37ec68bf67 c6611d0016e2 9be24340fc0e 726e5ebd56d

In [27]:
# Largest value in the model's prediction output for the sample document.
model.model.predict(vectorizer.transform([words])).max()

0.9925372

In [28]:
# Map the position of the largest prediction value back to its class name.
labels[model.model.predict(vectorizer.transform([words])).argmax()]

'POLICY CHANGE'