In [1]:
#importing all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import Adam, Adagrad
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelBinarizer
from gensim.models import KeyedVectors
from sklearn.preprocessing import LabelEncoder
import pickle
import random

In [2]:
# Seed every random number generator this notebook has in scope so that a
# fresh-kernel rerun starts from the same state.
SEED = 666
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): the Keras/TensorFlow backend keeps its own RNG, which is not
# seeded here — weight initialisation and dropout will still vary between runs
# unless it is seeded separately.

Loading the dataset

In [3]:
# Read the complaints table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="new_complaints3.csv")

The data is split into the feature (complaint text) and the target (product label).

In [4]:
# Feature = the 'new' text column, target = the 'Product' label column.
X_data, y_data = df['new'], df['Product']

In [5]:
# Display the feature Series (pandas elides the middle rows of a long Series).
X_data

0         transworld systems inc \nis trying to collect ...
1         Over the past 2 weeks I have been receiving ex...
2         Pioneer has committed several federal violatio...
3         Previously on XX XX XXXX XX XX XXXX and XX XX ...
4         Hello This complaint is against the three cred...
                                ...                        
597869    1  Mailing Address is incorrect \n2  Date of B...
597870    I made a purchase of ##### on XXXX XXXX #### u...
597871    On XXXX XXXX #### I contacted XXXX XXXX who is...
597872    I can not get from chase who services my mortg...
597873    cfbp i would Like to file a complaint on Exper...
Name: new, Length: 597874, dtype: object

The feature and target columns are converted to NumPy arrays.

In [6]:
# Pull the underlying NumPy arrays out of the two pandas Series.
X, y = X_data.values, y_data.values

The data is split into a training set (60 percent of the data) and a held-out set (40 percent), stratified by label. The held-out set is later divided equally into validation and test sets (20 percent of the data each).

In [7]:
# Stratified split: 60% for training, 40% held out. The held-out part carries
# the "val" name here but is divided again later in the notebook.
X_train_value, X_val_value, y_train_value, y_val_value = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=666,
    stratify=y,
)

Vocabulary was built based on the unique words from the train set.

In [8]:
# Word-level tokenizer that keeps the original casing (lower=False).
# NOTE(review): presumably casing is kept to match the pretrained vectors — confirm.
tokenizer = Tokenizer(lower=False)
# Build the word index from the training texts only, so nothing from the
# held-out texts leaks into the vocabulary.
tokenizer.fit_on_texts(X_train_value)

In [9]:
# Mapping word -> integer index produced by the fitted tokenizer.
vocab = tokenizer.word_index

In [10]:
# Number of distinct words seen in the training texts.
vocab_size = len(vocab)

In [11]:
# Show the vocabulary size.
vocab_size

159487

Create an all-zero weight matrix with one row per word in the document vocabulary, plus one extra row (+1) because the word-index dictionary values start at 1.

In [12]:
# One row per word index (row 0 stays all-zero because indices start at 1),
# 300 columns per row.
# NOTE(review): 300 presumably matches the dimensionality of the pretrained
# vectors loaded below — confirm.
embedding_matrix = np.zeros(shape=(vocab_size + 1, 300))

Fill in the embedding matrix

In [13]:
# Load previously saved gensim KeyedVectors (word -> vector lookup) from disk.
vectors = KeyedVectors.load('vectors.kv')

In [14]:
# Copy each vocabulary word's pretrained vector into its row of the embedding
# matrix. Words that are missing from the pretrained vectors all share the
# 'UNK' vector, which is looked up once up front.
unk_vector = vectors['UNK']
for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = vectors[word]
    except KeyError:
        # Only "word not in the pretrained vocabulary" is treated as a miss;
        # any other error (shape mismatch, interrupt, ...) should surface.
        embedding_matrix[i] = unk_vector

In [15]:
# Replace every text by its list of word indices, for both splits.
X_train, X_val = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_value, X_val_value)
)

Sanity check

In [16]:
# Whitespace-separated word count of held-out text number 1.
len(X_val_value[1].split())

106

In [17]:
# Show one raw held-out text.
# NOTE(review): this displays index 7, while the length comparison in the
# neighbouring cells uses index 1.
X_val_value[7]

'I previously tried to settle this matter with experian about unauthorized inquiries on my consumer report i sent them a letter explaining the problem trying to get experian to resolve the issue but they never did On XX XX #### i sent out another letter certified about the unauthorized inquiries on my consumer report and i still have not got a response I requested there method of verification about these inquires and i never received anything from experian i will send a copy of the certified letter that i sent to experian on XX XX ####'

In [18]:
# Length of the index sequence for held-out text number 1; equal to the
# whitespace word count when the tokenizer dropped no words from this sample.
len(X_val[1])

106

The word-index sequences are padded or truncated to a fixed length of 200 tokens per instance.

In [19]:
# Same padding policy for both splits: fixed length 200, with padding and
# truncation both applied at the end of the sequence.
pad_kwargs = dict(maxlen=200, truncating='post', padding='post')
X_train = pad_sequences(X_train, **pad_kwargs)
X_val = pad_sequences(X_val, **pad_kwargs)

The labels are integer-encoded and then converted to one-hot vectors.

In [20]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
enc = LabelEncoder()
enc.fit(y_train_value)
y_train_label = enc.transform(y_train_value)
y_val_label = enc.transform(y_val_value)

In [21]:
# Class names in encoder order, then the per-class counts of each split
# (training first, held-out second).
print(enc.classes_)
for encoded_labels in (y_train_label, y_val_label):
    print(np.unique(encoded_labels, return_counts=True))

['Checking or savings account' 'Credit card or prepaid card'
 'Credit or consumer reporting, credit repair services' 'Debt collection'
 'Money transfer or service, virtual currency' 'Mortgage' 'Personal loan'
 'Student loan' 'Vehicle loan or lease']
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([ 15516,  38824, 145461,  74649,   7690,  42251,  11619,  16220,
         6494], dtype=int64))
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([10344, 25882, 96974, 49767,  5126, 28168,  7746, 10813,  4330],
      dtype=int64))


In [22]:
# One-hot encode the integer labels. The width is pinned to the encoder's class
# count so both matrices have the same number of columns even if one split
# happened to miss the highest-numbered class.
n_classes = len(enc.classes_)
y_train = to_categorical(y_train_label, num_classes=n_classes)
y_val = to_categorical(y_val_label, num_classes=n_classes)
print('Shape of data tensor:', X_train.shape)
print('Shape of label tensor:', y_train.shape)
print('Shape of label tensor:', y_val.shape)

Shape of data tensor: (358724, 200)
Shape of label tensor: (358724, 9)
Shape of label tensor: (239150, 9)


Split the data further for validation and test

In [23]:
# Divide the held-out data in half into validation and test sets, stratified on
# the one-hot labels.
# NOTE: this cell overwrites X_val / y_val with half of themselves, so running
# it a second time halves the validation set again. Re-run the upstream cells
# that build X_val and y_val before re-running this one.
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=666, stratify=y_val)

In [24]:
# Report the number of rows in each of the three splits.
print('training/validation/test')
print(f"{X_train.shape[0]}/{X_val.shape[0]}/{X_test.shape[0]}")


training/validation/test
358724/119575/119575


In [25]:
from keras.layers import Embedding, LSTM, Dropout, Conv1D, MaxPool1D, Flatten
from keras.callbacks import EarlyStopping

In [26]:
# Embedding-layer dimensions shared by all models below.
input_dim = vocab_size + 1  # one row per word index, plus index 0
output_dim = 300  # width of each embedding vector
input_length = 200  # same fixed sequence length used when padding

In [24]:
# Stop training when val_loss has not improved for 5 epochs and restore the
# weights from the best epoch.
earlystopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

## LSTM

In [26]:
# LSTM classifier with an embedding layer trained from scratch:
# embedding -> LSTM(128) -> dropout(0.2) -> 9-way softmax.
rnn = Sequential()
rnn.add(Embedding(input_dim=input_dim,
                  output_dim=output_dim,
                  input_length=input_length,
                  trainable=True))
rnn.add(LSTM(128))
rnn.add(Dropout(0.2))
rnn.add(Dense(9, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
rnn.compile(loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=['acc'])

In [27]:
# Print the layer-by-layer output shapes and parameter counts.
rnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 300)          47846400  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 9)                 1161      
Total params: 48,067,209
Trainable params: 48,067,209
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Train for at most 50 epochs; early stopping on validation loss ends the run sooner.
batch_size = 50
history5 = rnn.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=batch_size,
    callbacks=[earlystopping],
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


## CNN

In [54]:
# Fresh early-stopping callback for the CNN run: patience of 5 epochs on
# val_loss, restoring the best weights at the end.
earlystopping3 = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

In [56]:
# 1-D CNN classifier with an embedding layer trained from scratch:
# embedding -> Conv1D(128 filters, width 5) -> dropout(0.2) -> max-pool(5)
# -> flatten -> 9-way softmax.
cnn2 = Sequential()
cnn2.add(Embedding(input_dim=input_dim,
                   output_dim=output_dim,
                   input_length=input_length,
                   trainable=True))
cnn2.add(Conv1D(128, 5, activation='relu'))
cnn2.add(Dropout(0.2))
cnn2.add(MaxPool1D(5))
cnn2.add(Flatten())
cnn2.add(Dense(9, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
cnn2.compile(loss='categorical_crossentropy',
             optimizer=optimizer,
             metrics=['acc'])

In [57]:
# Print the layer-by-layer output shapes and parameter counts.
cnn2.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 200, 300)          47846400  
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 196, 128)          192128    
_________________________________________________________________
dropout_7 (Dropout)          (None, 196, 128)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 4992)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 9)                 44937     
Total params: 48,083,465
Trainable params: 48,083,465
Non-trainable params: 0
__________________________________________

In [58]:
# Train for at most 50 epochs; early stopping on validation loss ends the run sooner.
batch_size = 50
history7 = cnn2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=batch_size,
    callbacks=[earlystopping3],
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


## LSTM W2V

In [81]:
# Fresh early-stopping callback for the LSTM + pretrained-vectors run:
# patience of 5 epochs on val_loss, restoring the best weights at the end.
earlystopping4 = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

In [83]:
# LSTM classifier on top of the pretrained embedding matrix, which is kept
# frozen (trainable=False):
# embedding -> LSTM(128) -> dropout(0.2) -> 9-way softmax.
rnn_w2v = Sequential()
rnn_w2v.add(Embedding(input_dim=input_dim,
                      output_dim=output_dim,
                      weights=[embedding_matrix],
                      input_length=input_length,
                      trainable=False))
rnn_w2v.add(LSTM(128))
rnn_w2v.add(Dropout(0.2))
rnn_w2v.add(Dense(9, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
rnn_w2v.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['acc'])

In [84]:
# Print the layer-by-layer output shapes and parameter counts.
rnn_w2v.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 200, 300)          47846400  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 1161      
Total params: 48,067,209
Trainable params: 220,809
Non-trainable params: 47,846,400
_________________________________________________________________


In [85]:
# Train for at most 50 epochs; early stopping on validation loss ends the run sooner.
batch_size = 50
history8 = rnn_w2v.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=batch_size,
    callbacks=[earlystopping4],
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


## CNN W2V

In [27]:
# Fresh early-stopping callback for the CNN + pretrained-vectors run:
# patience of 5 epochs on val_loss, restoring the best weights at the end.
earlystopping7 = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

In [28]:
# 1-D CNN classifier on top of the pretrained embedding matrix, which is kept
# frozen (trainable=False):
# embedding -> Conv1D(128 filters, width 5) -> dropout(0.2) -> max-pool(5)
# -> flatten -> 9-way softmax.
cnn_w2v2 = Sequential()
cnn_w2v2.add(Embedding(input_dim=input_dim,
                       output_dim=output_dim,
                       weights=[embedding_matrix],
                       input_length=input_length,
                       trainable=False))
cnn_w2v2.add(Conv1D(128, 5, activation='relu'))
cnn_w2v2.add(Dropout(0.2))
cnn_w2v2.add(MaxPool1D(5))
cnn_w2v2.add(Flatten())
cnn_w2v2.add(Dense(9, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
cnn_w2v2.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['acc'])

In [29]:
# Print the layer-by-layer output shapes and parameter counts.
cnn_w2v2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 300)          47846400  
_________________________________________________________________
conv1d (Conv1D)              (None, 196, 128)          192128    
_________________________________________________________________
dropout (Dropout)            (None, 196, 128)          0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 39, 128)           0         
_________________________________________________________________
flatten (Flatten)            (None, 4992)              0         
_________________________________________________________________
dense (Dense)                (None, 9)                 44937     
Total params: 48,083,465
Trainable params: 237,065
Non-trainable params: 47,846,400
______________________________________

In [30]:
# Train for at most 50 epochs; early stopping on validation loss ends the run sooner.
batch_size = 50
history11 = cnn_w2v2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=batch_size,
    callbacks=[earlystopping7],
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


In [31]:
# Another early-stopping callback with the same settings as the others.
# NOTE(review): no cell visible in this notebook passes it to fit().
earlystopping6 = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

In [44]:
def save_model(history, model, history_file, model_json, model_weight):
    """Persist one training run as three files.

    Parameters
    ----------
    history : object with a ``history`` attribute
        Its ``history`` attribute is pickled as-is to ``history_file``.
    model : object with ``to_json()`` and ``save_weights(path)``
        The architecture goes to ``model_json``, the weights to ``model_weight``.
    history_file, model_json, model_weight : str
        Destination paths; existing files are overwritten.

    Returns
    -------
    None. Prints ``model saved`` once all three files are written.
    """
    # 1) Training curves: binary pickle of the per-epoch metrics.
    with open(history_file, 'wb') as history_out:
        pickle.dump(history.history, history_out)

    # 2) Architecture: the JSON text produced by the model itself.
    architecture = model.to_json()
    with open(model_json, "w") as architecture_out:
        architecture_out.write(architecture)

    # 3) Weights: written by the model to the given path.
    model.save_weights(model_weight)
    print("model saved")


In [115]:
def save_hist(hist, json_name, csv_name):
    """Write a training history to both a JSON file and a CSV file.

    Parameters
    ----------
    hist : object with a ``history`` attribute
        ``hist.history`` is turned into a DataFrame (one column per metric,
        one row per epoch).
    json_name : str
        Destination path of the JSON export.
    csv_name : str
        Destination path of the CSV export.

    Returns
    -------
    None. Existing files at either path are overwritten.
    """
    hist_df = pd.DataFrame(hist.history)

    # Hand pandas the paths rather than already-open text-mode handles: pandas
    # then opens the files itself with the right newline handling. A handle
    # opened without newline='' gets doubled line endings in the CSV on Windows.
    hist_df.to_json(json_name)
    hist_df.to_csv(csv_name)

In [117]:
# The original call referenced `history9`, which no cell in this notebook
# defines, so it raised NameError on a fresh-kernel run. It now exports the
# history of the CNN + pretrained-vectors run, the same run saved by the
# save_model call in the next cell.
# NOTE(review): confirm this is the run whose history was meant to be exported.
save_hist(history11, 'history11.json', 'history11.csv')

In [47]:
# Save history, architecture and weights of the CNN + pretrained-vectors model.
save_model(
    history=history11,
    model=cnn_w2v2,
    history_file='trainHistoryDict_model_4_cnn_w2v2',
    model_json='model_4_cnn_w2v2.json',
    model_weight='model_4_cnn_w2v2.h5',
)

model saved
