# Neural Network Text Classification Playground

In this notebook, I'm experimenting with text classification on the [US CFPB](https://www.kaggle.com/cfpb/us-consumer-finance-complaints) dataset. The models I tried are:

- 1-layer LSTM + dense layer
- 1-layer LSTM with metadata features
- 1-layer LSTM with GloVe embeddings and metadata features
- 1-D convnet with GloVe embeddings
- 2-layer bidirectional LSTM with GloVe embeddings



In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Concatenate, Bidirectional
from tensorflow.keras.layers import Conv1D,MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# Seed for the train/test split so every run (and the metadata rows selected
# from the split's index further down) sees the same partition.
RANDOM_SEED = 42

# Load the raw complaints and normalise the column names:
# spaces -> underscores, lower-cased, '-' and '?' removed.
df = pd.read_csv('consumer_complaints.csv')
df.columns = [c.replace(' ','_').lower().replace('-','').replace('?','') for c in df.columns]

# Binary target: 1 when the company's response is one of the "relief" outcomes.
relief_tags = ['Closed with non-monetary relief',
               'Closed with monetary relief',
               'Closed with relief']
df['relief_received'] = df.company_response_to_consumer.isin(relief_tags).astype(int)

# Keep only rows that have a narrative. reset_index() makes the row labels
# positional (0..n-1), which the later metadata indexing relies on; the old
# labels are preserved in an `index` column.
df_text = df.dropna(axis=0, subset=['consumer_complaint_narrative']).reset_index()

X = df_text['consumer_complaint_narrative'].str.lower()
y = df_text.relief_received

# NOTE(review): train_size=0.2 trains on 20% of the rows and evaluates on the
# remaining 80%. Confirm this is intentional (e.g. to keep training fast);
# the conventional split would be test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.2, random_state=RANDOM_SEED)


In [3]:

# Vocabulary cap handed to the tokenizer (most frequent words are kept).
MAX_NB_WORDS = 20_000
# Token length every complaint is padded to.
MAX_SEQUENCE_LENGTH = 500
# Size of each word vector; matches the 100-d GloVe file loaded later.
EMBEDDING_DIM = 100

In [13]:


# Build the vocabulary from the training narratives only.
# The filter set contains a backslash, so it is written as a raw string;
# in a normal string literal "\]" is an invalid escape sequence.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(X_train.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert each narrative to a fixed-length sequence of word ids.
X_train_text = tokenizer.texts_to_sequences(X_train.values)
X_train_text = pad_sequences(X_train_text, maxlen=MAX_SEQUENCE_LENGTH)

X_test_text = tokenizer.texts_to_sequences(X_test.values)
X_test_text = pad_sequences(X_test_text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train_text.shape)

# One-hot targets for the 2-unit softmax models.
y_binary_train = to_categorical(y_train)
y_binary_test = to_categorical(y_test)



Found 41213 unique tokens.
Shape of data tensor: (39994, 500)


In [14]:
## basic lstm
# Trainable embedding -> single LSTM -> 2-way softmax over the one-hot targets.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train_text.shape[1]))
model.add(LSTM(64, dropout=0.2))
model.add(Dense(2, activation='softmax'))
opt = Adam(learning_rate=3e-4)
model.compile(loss='categorical_crossentropy',
              optimizer=opt, metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Shared by the later fit cells until they are reassigned.
epochs = 10
batch_size = 256

# 10% of the training rows are used for validation; training stops once
# val_loss has failed to improve by min_delta for 3 consecutive epochs.
# (Optional experiment: class_weight={1: 2, 0: 1} to improve recall on class 1.)
history = model.fit(X_train_text, y_binary_train, epochs=epochs,
                    batch_size=batch_size, validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 100)          2000000   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 2)                 130       
Total params: 2,042,370
Trainable params: 2,042,370
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [17]:

# Categorical complaint metadata used as auxiliary model inputs.
meta_columns = ['product','subproduct','issue','subissue','timely_response','consumer_disputed']

# One-hot variant of the metadata (not consumed by the models in the cells shown here).
meta_one_hot_encoded = pd.get_dummies(df_text[meta_columns])

# Integer-coded variant; missing values are mapped to their own 'none' category.
# The fitted encoder is kept so the same category mapping can be reused later.
meta_encoder = OrdinalEncoder()
meta_ordinal_encoded = meta_encoder.fit_transform(df_text[meta_columns].fillna('none'))


In [18]:
# Pick the metadata rows that belong to each side of the split. The split
# Series carry df_text's row labels, which are used here as array positions.
meta_ordinal_encoded_train, meta_ordinal_encoded_test = (
    meta_ordinal_encoded[part.index] for part in (X_train, X_test)
)

In [19]:

# Two named inputs: padded word-id sequences and the ordinal-encoded metadata.
nlp_input = Input(shape=(X_train_text.shape[1],), name='text')
meta_input = Input(shape=(meta_ordinal_encoded_train.shape[1],), name='meta')

# Text branch: trainable embedding followed by a bidirectional LSTM.
emb = Embedding(
    input_dim=MAX_NB_WORDS,
    output_dim=EMBEDDING_DIM,
    input_length=X_train_text.shape[1],
)(nlp_input)
nlp_out = Bidirectional(LSTM(128))(emb)

# The text encoding and the metadata columns are joined before the dense head.
concat = Concatenate()([nlp_out, meta_input])
classifier = Dense(32, activation='relu')(concat)
output = Dense(2, activation='softmax')(classifier)
model = Model(inputs=[nlp_input, meta_input], outputs=[output])

opt = Adam(learning_rate=3e-4)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])

In [20]:
# Print the layer table and parameter counts of the current `model`.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, 500)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 100)     2000000     text[0][0]                       
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 256)          234496      embedding_1[0][0]                
__________________________________________________________________________________________________
meta (InputLayer)               [(None, 6)]          0                                            
______________________________________________________________________________________________

In [21]:

# Train the two-input model. Inputs are routed by the Input layer names
# ('text' / 'meta'); `epochs` and `batch_size` come from the earlier LSTM cell.
model.fit(
    x={"text": X_train_text, "meta": meta_ordinal_encoded_train},
    y=y_binary_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x2527513bd30>

In [22]:
# Per-class precision/recall on the test split; the predicted class is the
# index of the larger softmax output.
print(classification_report(
    y_test,
    np.argmax(
        model.predict({'text': X_test_text, 'meta': meta_ordinal_encoded_test}),
        axis=1,
    ),
))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90    131030
           1       0.48      0.14      0.22     28946

    accuracy                           0.82    159976
   macro avg       0.66      0.55      0.56    159976
weighted avg       0.77      0.82      0.77    159976



In [4]:
## Glove
# Placeholder: set this to the folder that contains glove.6B.100d.txt.
GLOVE_DIR = r"GLOVE DIRECTORY"


def load_glove_vectors(path):
    """Read a GloVe text file into a {word: float32 vector} dictionary.

    Each non-empty line is split on whitespace: the first field is the word,
    the remaining fields are its vector components.

    Parameters
    ----------
    path : str
        Location of the GloVe file, read as UTF-8.

    Returns
    -------
    dict
        Maps each word to a 1-D float32 numpy array.
    """
    vectors = {}
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


embeddings_index = load_glove_vectors(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [5]:
# Second tokenizer, this time with the Tokenizer's default filters; it
# replaces the earlier `tokenizer` and `word_index`.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train.values)
word_index = tokenizer.word_index

# Training narratives -> fixed-length id sequences.
sequences = tokenizer.texts_to_sequences(X_train.values)
train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Test narratives, encoded with the vocabulary fitted on the training split.
test_sequences = tokenizer.texts_to_sequences(X_test.values)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Found %s unique tokens.' % len(word_index))

Found 39767 unique tokens.


In [6]:
# One row per word id (row 0 is never assigned by this loop); rows start as zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))

for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No GloVe vector for this word: its row stays all-zero.
        continue
    embedding_matrix[row] = vector


In [7]:
# Frozen (trainable=False) embedding layer initialised from the GloVe matrix.
# The matrix is supplied through `embeddings_initializer`, the documented way
# to preload embedding weights, instead of the legacy `weights=` constructor
# argument.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# The layer is applied to each model's own input tensor in the cells that build the models.

In [4]:
# Text + metadata model whose text branch uses the frozen GloVe embedding layer.
nlp_input = Input(name='text',shape=(None,))
meta_input = Input(name='meta',shape=(meta_ordinal_encoded_train.shape[1],))

# The embedding has to be applied to THIS model's text input. Without this
# line `embedded_sequences` is either undefined on a fresh kernel or left
# over from another model's graph, which is not connected to `nlp_input`.
embedded_sequences = embedding_layer(nlp_input)

nlp_out = Bidirectional(LSTM(64))(embedded_sequences)
concat = keras.layers.concatenate([nlp_out, meta_input])
classifier = Dense(32, activation='relu')(concat)
output = Dense(2, activation='softmax')(classifier)
model = Model(inputs=[nlp_input , meta_input], outputs=[output])

# Sparse loss: this model is trained on the integer labels in `y_train`,
# not on the one-hot targets.
opt = Adam(learning_rate=3e-4)
model.compile(loss='sparse_categorical_crossentropy',
                   optimizer=opt, metrics=['acc'])

NameError: name 'meta_ordinal_encoded_train' is not defined

In [27]:
# Training settings for this run (they overwrite the earlier values).
epochs, batch_size = 10, 128

# Full 10-epoch run on integer labels; no validation split and no early
# stopping are used here.
model.fit(
    x={"text": train_data, "meta": meta_ordinal_encoded_train},
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x253e9285070>

In [28]:
# Test-split report; the class with the larger softmax output is the prediction.
glove_meta_inputs = {'text': test_data, 'meta': meta_ordinal_encoded_test}
print(classification_report(y_test, model.predict(glove_meta_inputs).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89    131030
           1       0.43      0.24      0.31     28946

    accuracy                           0.81    159976
   macro avg       0.64      0.59      0.60    159976
weighted avg       0.77      0.81      0.78    159976



In [29]:
# 1-D CNN
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two (conv -> max-pool) stages, then a third conv reduced by global max pooling.
x = embedded_sequences
for _stage in range(2):
    x = Conv1D(128, 5, activation="relu")(x)
    x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation="relu")(x)
x = GlobalMaxPooling1D()(x)

# Dense head with dropout, ending in a 2-way softmax.
x = Dense(128, activation="relu")(x)
x = Dropout(0.5)(x)
preds = Dense(2, activation="softmax")(x)
model = Model(int_sequences_input, preds)

opt = Adam(learning_rate=3e-4)
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["acc"])

In [30]:
# Train the CNN on one-hot targets, validating on 10% of the training rows.
# No class weights and no early stopping are applied in this run.
model.fit(
    x=train_data,
    y=keras.utils.to_categorical(y_train),
    batch_size=150,
    epochs=epochs,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25455aa7910>

In [31]:
# Test-split report for the CNN; argmax over the two softmax outputs gives the class.
print(classification_report(y_test, np.argmax(model.predict(test_data), axis=1)))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89    131030
           1       0.40      0.13      0.20     28946

    accuracy                           0.81    159976
   macro avg       0.61      0.54      0.55    159976
weighted avg       0.75      0.81      0.77    159976



In [8]:
from tensorflow.compat.v1 import ConfigProto, InteractiveSession

# GPU memory settings, applied through a TF1-compat interactive session:
# allocation is allowed to grow on demand and the per-process fraction is set to 0.75.
config = ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.75
session = InteractiveSession(config=config)

In [9]:
# Text-only model: frozen GloVe embeddings -> two stacked bidirectional LSTMs
# -> a single sigmoid unit.
nlp_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(nlp_input)

# The first recurrent layer returns the full sequence so the second one can consume it.
sequence_encoding = Bidirectional(LSTM(64, return_sequences=True))(embedded_sequences)
nlp_out = Bidirectional(LSTM(64))(sequence_encoding)
output = Dense(1, activation='sigmoid')(nlp_out)
model = Model(inputs=nlp_input, outputs=output)

opt = Adam(learning_rate=3e-4)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

In [10]:
# Print the layer table and parameter counts of the current `model`.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         3976800   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         84480     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 4,160,225
Trainable params: 183,425
Non-trainable params: 3,976,800
_________________________________________________________________


In [None]:
# Train on the integer labels (single sigmoid output), validating on 10% of
# the training rows. No class weights and no early stopping are applied.
model.fit(
    x=train_data,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
  4/141 [..............................] - ETA: 15s - loss: 0.4824 - acc: 0.8086