In [1]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to the local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Let's Get Started!

### If you run into keras / tensorflow issues, enable the following cell (via the top dropdown menu) and run it

#### Import some standard stuff

In [2]:
import config
import math
import numpy as np
import os
import pickle
import sys

#### Import some custom stuff

In [3]:
from keras_model_helpers import build_keras_model, use_lstm
from nb_model_helpers import build_nb_model, use_nb
from util import split_data
from vocabulary_processor_helpers import build_vocabulary_processor, _text_normalizer, _tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Process Data

In [4]:
# Name of the input CSV; it is handed to split_data() in the next cell.
data_file_name = 'case_study_data.csv' # change as desired

In [9]:
# split_data() returns the label mapping plus the sample count of each split.
(label_mapping,
 total_training_samples,
 total_validation_samples,
 total_test_samples) = split_data(data_file_name)

# Number of batches needed to cover each split once, rounded up so that a
# final partial batch is still counted.
train_steps = int(math.ceil(total_training_samples / config.batch_size))
validation_steps = int(math.ceil(total_validation_samples / config.batch_size))

# Persist the label mapping and its inverse so they can be reloaded later.
with open('label_mapping_dict.p', 'wb') as mapping_file:
    pickle.dump(label_mapping, mapping_file)

reverse_label_mapping = {value: key for key, value in label_mapping.items()}
with open('reverse_label_mapping_dict.p', 'wb') as mapping_file:
    pickle.dump(reverse_label_mapping, mapping_file)

# Locations of the per-split TSV files used by the model cells below
# (presumably written by split_data() -- confirm in util.py).
data_root = 'data/'
validation_data_path, test_data_path, training_data_path = (
    os.path.join(data_root, tsv_name)
    for tsv_name in ('validation_data.tsv', 'test_data.tsv', 'training_data.tsv')
)

Processing Data. Please Wait!


# Naive Bayes Model

In [6]:
# Train the Naive Bayes model. The helper receives the training, test and
# validation TSV paths (in that order) and prints confusion matrices as it runs.
print("Building a Naive Bayes model")
nb_model = build_nb_model(training_data_path, test_data_path, validation_data_path)
# Score the test set. use_nb() is not passed nb_model, so it presumably reloads a
# model persisted by build_nb_model() -- confirm in nb_model_helpers.
# NOTE: true_classes and ids are reassigned later by the LSTM cell.
nb_probs, true_classes, nb_preds, ids = use_nb(test_data_path)

Building a Naive Bayes model
Processing Training Data
Processing Validation Data
Processing Test Data
Confusion Matrix for VALIDATION data in data/validation_data.tsv
PREDICTED CLASS ON X-AXIS. TRUE CLASS ON Y-AXIS.
                  bank_service  credit_card  credit_reporting  \
bank_service              1549          252                36   
credit_card                138         2409               171   
credit_reporting            33          244              7110   
debt_collection             26          132               439   
loan                        41           72               186   
money_transfers            138           58                 0   
mortgage                    29           24                45   

                  debt_collection  loan  money_transfers  mortgage  
bank_service                   28    21               18       103  
credit_card                   114    77               12        34  
credit_reporting              448   168                0

# LSTM Model

In [None]:
print("Building an LSTM model")

# Build the vocabulary processor from the training data and pickle it so it
# can be reloaded outside this notebook.
vocab_processor = build_vocabulary_processor(
    training_data_path,
    config.max_len,
    config.min_word_count_freq
)
with open('vocab_processor.p', 'wb') as vocab_file:
    pickle.dump(vocab_processor, vocab_file)

# Train the Keras model; hyper-parameters come from config, step counts from
# the data-processing cell above.
model = build_keras_model(
    config.batch_size,
    config.dropout_rate,
    config.embedding_size,
    config.max_len,
    config.num_epochs,
    train_steps,
    validation_steps,
    vocab_processor,
    label_mapping,
    training_data_path,
    validation_data_path,
    model_name='saved_keras_model'
)

# Score the held-out test set with the trained model.
lstm_probs, true_classes, lstm_preds, ids = use_lstm(
    model,
    test_data_path,
    vocab_processor,
    label_mapping
)

Building an LSTM model
Model built with vocabulary of size 25810
Epoch 1/2

# Score new datasets and have the predicted sample labels written to tsv files
### Assumes new dataset is in same format as original dataset (case_study_data.csv)

#### Change 'dataset_that_you_want_score.csv' in the cells below to the name of your dataset file (should contain same columns/headers/labels as case_study_data.csv)

In [None]:
# Run the Naive Bayes scoring script in a shell; replace the CSV argument with your dataset file.
!python use_trained_nb_model.py dataset_that_you_want_score.csv

In [None]:
# Run the LSTM scoring script in a shell; replace the CSV argument with your dataset file.
!python use_trained_lstm_model.py dataset_that_you_want_score.csv