# Load modules

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pickle import load

import re
import nltk
# Fetch the NLTK resources requested by this notebook (no re-download if already up to date).
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load data

In [0]:
# load the tokenizer
# NOTE(security): unpickling can execute arbitrary code, so only load a
# tokenizer.pkl that comes from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# load the model
model = load_model('model.h5')

# Parameters configuration
# Length passed to pad_sequences as `maxlen` before prediction
# (presumably the same value that was used when the model was trained -- confirm).
MAX_SEQUENCE_LENGTH = 250

# Text preprocessing

In [4]:
def lemmatize_verbs(words):
    """Return a new list holding the verb lemma of each token in ``words``, in order."""
    wordnet = WordNetLemmatizer()
    # pos='v' makes the lemmatizer treat every token as a verb.
    return [wordnet.lemmatize(token, pos='v') for token in words]

stop_words = stopwords.words('english')


def text_prepocessing(text):
    """Clean one raw complaint string for tokenization.

    Steps, in order: strip 'x'/'X' characters, lowercase, turn hyphens into
    spaces, replace numbers with the token ``NUMBER``, drop every character
    that is not a letter, digit or whitespace, remove English stop words and
    lemmatize the remaining tokens as verbs.

    Args:
        text: the raw complaint text (str).

    Returns:
        The cleaned tokens joined by single spaces (str).
    """
    # noise removal
    # NOTE(review): this strips every 'x'/'X' character, not only the 'XXXX'
    # redaction masks, so e.g. 'tax' becomes 'ta'. Left unchanged because the
    # saved tokenizer/model were presumably fit on text processed this way --
    # confirm before tightening the pattern.
    text = re.sub(r'[xX]', '', text)  # remove 'x'

    # normalization
    text = text.lower()  # convert to lowercase text
    text = re.sub(r'\-', ' ', text)  # separate words like 'video-related'
    # Hyphens are already gone at this point, so only '+' can match as a sign.
    text = re.sub(r'[-+]?\d*\.?\d+', ' NUMBER ', text)  # replace numbers with "NUMBER"
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove punctuation

    # Build the set on each call so later changes to `stop_words` are still
    # honoured, while every membership test is O(1) instead of a list scan.
    stop_word_set = set(stop_words)
    kept_words = [word for word in text.split() if word not in stop_word_set]  # remove stop words

    # Lemmatize the filtered tokens directly; no need to join and re-split.
    return ' '.join(lemmatize_verbs(kept_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Predict

In [5]:
# input: a list of raw complaint narratives (one or more)
new_complaint = ["I have an open and current mortgage with Chase Bank # XXXX. Chase is reporting the loan payments to XXXX but XXXX is surpressing the information and reporting the loan as Discharged in BK. This mortgage was reaffirmed in a Chapter XXXX BK discharged dated XXXX/XXXX/2013. Chase keeps referring to BK Law for Chapter XXXX and we keep providing documentation for Chapter XXXX, and the account should be open and current with all the payments. "]
# text preprocessing (every complaint in the list, not only the first one)
processed = [text_prepocessing(complaint) for complaint in new_complaint]
# tokenization and padding
seq = tokenizer.texts_to_sequences(processed)
seq_padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
# prediction
y_predict = model.predict(seq_padded)
# Class names indexed by the model's output position
# (order presumably matches the label encoding used at training time -- confirm).
labels = ['Bank account or service',
          'Consumer Loan',
          'Credit card',
          'Credit reporting',
          'Debt collection',
          'Money transfers',
          'Mortgage',
          'Other financial service',
          'Payday loan',
          'Prepaid card',
          'Student loan']
# Take the argmax per row so each complaint gets its own label; a flat
# argmax over the whole array is only correct for a single sample.
# Assumes y_predict is (n_complaints, len(labels)) -- TODO confirm.
for class_index in np.argmax(y_predict, axis=1):
    print("The product that the complaint is about: ", labels[class_index])

The product that the complaint is about:  Mortgage
