### Import all the required libraries

In [209]:
import os
import sys
import json
import time
import logging
import data_helper
import numpy as np
import pandas as pd
import tensorflow as tf
from text_cnn import TextCNN
from tensorflow.contrib import learn
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

### train_cnn - driver function for this repository.
#### train_cnn takes two arguments
    ### 1. the entire data set (features and labels)
    ### 2. parameter values for the CNN
#### It first calls the data_helper module, which does the data cleaning and preprocessing.

In [212]:
def train_cnn(train_file, parameter_file):
    """Train a TextCNN classifier and evaluate it on a held-out test split.

    The data is loaded through ``data_helper.load_data_and_labels``, every
    sentence is padded to the length of the longest one and mapped to word
    ids, and the result is split 90/10 into (train + validation) / test and
    then 90/10 again into train / validation.  The model is validated every
    ``params['evaluate_every']`` steps and a checkpoint is written whenever
    the validation accuracy ties or beats the best value seen so far.  The
    most recent (i.e. best) checkpoint is restored before the test split is
    evaluated.

    Side effects: writes ``./labels.json`` and creates a
    ``trained_model_<timestamp>`` directory (vocabulary pickle plus
    checkpoints) under the current working directory.

    Args:
        train_file: Path handed to ``data_helper.load_data_and_labels``.
        parameter_file: Path to a JSON file providing ``embedding_dim``,
            ``filter_sizes`` (comma-separated string), ``num_filters``,
            ``l2_reg_lambda``, ``dropout_keep_prob``, ``batch_size``,
            ``num_epochs`` and ``evaluate_every``.

    Returns:
        A tuple ``(test_pred, test_labels, test_data)`` of lists holding one
        entry per test batch: the model predictions, the label rows and the
        input rows of that batch.
    """
    # Step 0: load sentences, labels, and training parameters
    x_raw, y_raw, _, labels = data_helper.load_data_and_labels(train_file)

    # A context manager closes the parameter file even if parsing fails.
    with open(parameter_file) as param_fh:
        params = json.load(param_fh)

    # Step 1: pad each sentence to the same length and map each word to an id
    max_document_length = max(len(sentence.split(' ')) for sentence in x_raw)
    print('The maximum length of all sentences: {}'.format(max_document_length))
    print('\n')
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)

    # Step 2: split the original dataset into train and test sets
    x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

    # Step 3: shuffle the train set and split it into train and validation sets
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_val, y_train, y_val = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

    # Step 4: save the labels into labels.json
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)
    print('x_train: {}, x_val: {}, x_test: {}'.format(len(x_train), len(x_val), len(x_test)))
    print('y_train: {}, y_val: {}, y_test: {}'.format(len(y_train), len(y_val), len(y_test)))
    print('\n')

    # Step 5: build a graph and cnn object
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Used as a context manager the session becomes the default session
        # and is closed on exit, so repeated calls do not leak sessions.
        with tf.Session(config=session_conf) as sess:
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=params['embedding_dim'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "trained_model_" + timestamp))

            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            def train_step(x_batch, y_batch):
                """Run one optimisation step on a single batch."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']}
                sess.run(train_op, feed_dict)

            def val_step(x_batch, y_batch):
                """Evaluate one batch with dropout disabled.

                Returns the number of correct predictions and the predictions.
                """
                feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0}
                num_correct, predictions = sess.run([cnn.num_correct, cnn.predictions], feed_dict)
                return num_correct, predictions

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            # Stays None when training ends before the first evaluation, so
            # the final report cannot fail on an unbound name.
            path = None

            # Step 6: train the cnn model with x_train and y_train (batch by batch)
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Step 6.1: evaluate the model with x_val and y_val (batch by batch)
                if current_step % params['evaluate_every'] == 0:
                    val_batches = data_helper.batch_iter(list(zip(x_val, y_val)), params['batch_size'], 1)
                    total_val_correct = 0
                    for val_batch in val_batches:
                        x_val_batch, y_val_batch = zip(*val_batch)
                        num_val_correct, _ = val_step(x_val_batch, y_val_batch)
                        total_val_correct += num_val_correct

                    val_accuracy = float(total_val_correct) / len(y_val)
                    print('Accuracy on validation set: {}'.format(val_accuracy))

                    # Step 6.2: save the model if it is the best based on accuracy on val set
                    if val_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = val_accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step))
                        print('\n')
            print('The training is complete')

            # The weights in the session are those of the last training step,
            # which need not be the best ones; reload the best checkpoint so
            # the test accuracy really is "based on the best model".
            if path is not None:
                saver.restore(sess, path)

            # Step 7: predict x_test (batch by batch)
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
            total_test_correct = 0
            test_pred = []
            test_labels = []
            test_data = []
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                num_test_correct, preds = val_step(x_test_batch, y_test_batch)
                total_test_correct += num_test_correct
                test_pred.append(preds)
                test_labels.append(y_test_batch)
                test_data.append(x_test_batch)

            test_accuracy = float(total_test_correct) / len(y_test)
            print('\n')
            print('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path))
            print('\n')
            return test_pred, test_labels, test_data

In [223]:
# Train on the consumer-complaints data; keep the per-batch predictions,
# true label rows and input rows of the test split for inspection below.
p, y, x_test = train_cnn(
    train_file='data/consumer_complaints.csv.zip',
    parameter_file='parameters.json',
)

The maximum length of all sentences: 912


x_train: 54112, x_val: 6013, x_test: 6681
y_train: 54112, y_val: 6013, y_test: 6681


Accuracy on validation set: 0.4418759354731415
Best accuracy is 0.4418759354731415 at step 200


Accuracy on validation set: 0.5727590221187427
Best accuracy is 0.5727590221187427 at step 400


Accuracy on validation set: 0.6532512888741061
Best accuracy is 0.6532512888741061 at step 600


Accuracy on validation set: 0.6662231831032762
Best accuracy is 0.6662231831032762 at step 800


Accuracy on validation set: 0.7019790454016298
Best accuracy is 0.7019790454016298 at step 1000


Accuracy on validation set: 0.7144520206219857
Best accuracy is 0.7144520206219857 at step 1200


The training is complete


Accuracy on test set is 0.7374644514294267 based on the best model /Users/zairah@ibm.com/Documents/IBM/ImagingTeam/SampleWork/Data/Flipkart/CNN/trained_model_1513772897/checkpoints/model-1200




In [224]:
# Category names, in the fixed order that defines their one-hot encoding.
labels = [
    'Bank account or Service',
    'Consumer Loan',
    'Credit card',
    'Credit reporting',
    'Debt collection',
    'Money transfers',
    'Mortgage',
    'Other financial service',
    'Payday loan',
    'Prepaid card',
    'Student loan',
]
# Row k of the identity matrix is the one-hot vector of the k-th category.
one_hot = np.eye(len(labels), dtype=int)
label_dict = {name: encoding for name, encoding in zip(labels, one_hot)}

In [225]:
# Category names listed in the order of their integer ids (0-10).
values = [
    'Bank account or Service',
    'Consumer Loan',
    'Credit card',
    'Credit reporting',
    'Debt collection',
    'Money transfers',
    'Mortgage',
    'Other financial service',
    'Payday loan',
    'Prepaid card',
    'Student loan',
]
labels = list(range(len(values)))
# Lookup table: integer id -> category name.
label_int_dict = dict(enumerate(values))

In [226]:
# Flatten the per-batch label rows in `y` into one flat list of label vectors.
categoriesy = [label_row for batch in y for label_row in batch]

# Decode each label vector to every category name whose one-hot encoding in
# label_dict equals it (vectors that match no encoding contribute nothing).
caty = [
    name
    for label_row in categoriesy
    for name, encoding in label_dict.items()
    if np.array_equal(label_row, encoding)
]

In [227]:
# Single-column frame holding the decoded true category names.
y_df = pd.DataFrame({'y': caty})

In [228]:
def my_funcp(a):
    """Return the value in ``label_int_dict`` whose key equals ``a``.

    The entries are scanned in order and the first match wins; ``None`` is
    returned when no key compares equal to ``a``.
    """
    matches = (name for class_id, name in label_int_dict.items() if a == class_id)
    return next(matches, None)
# Flatten the per-batch predictions in `p` into one flat list.
categoriesp = [pred for batch in p for pred in batch]

# Translate each prediction through label_int_dict; predictions that are
# not a key of the dict become None.
catp = [label_int_dict.get(pred) for pred in categoriesp]

In [229]:
# Single-column frame holding the decoded predicted category names.
p_df = pd.DataFrame({'p': catp})

In [230]:
# Put actual and predicted labels side by side. The frame takes its index
# from y_df; the predicted column is then aligned to that index on assignment.
final_df = pd.DataFrame({'actual_label': y_df['y']})
final_df['predicted_label'] = p_df['p']
final_df.tail(10)

Unnamed: 0,actual_label,predicted_label
6671,Credit reporting,Credit reporting
6672,Debt collection,Debt collection
6673,Debt collection,Debt collection
6674,Debt collection,Mortgage
6675,Credit reporting,Credit reporting
6676,Credit reporting,Credit reporting
6677,Mortgage,Mortgage
6678,Debt collection,Debt collection
6679,Consumer Loan,Debt collection
6680,Credit reporting,Credit reporting
