In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D, Input
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
import joblib
import gensim.downloader as api
import warnings
warnings.filterwarnings("ignore")

# Reading Sample data
data = pd.read_csv("/kaggle/input/transaction-complaints/complaints-2021-09-08_07_12.csv", usecols=["Product", "Sub-product", "Issue", "Sub-issue", "Consumer complaint narrative"])
data.head(5)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Unnamed: 0,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,Debt collection,Credit card debt,Attempts to collect debt not owed,Debt was result of identity theft,
1,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,
2,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,
3,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Investigation took more than 30 days,
4,Debt collection,Other debt,Attempts to collect debt not owed,Debt was paid,


In [2]:
#Droping the rows those don't have complaint narrative
df = data.dropna(subset=['Consumer complaint narrative'])
df.head(5)

Unnamed: 0,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
41,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,Experian has allowed 6 inquiries on my credit ...
44,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Difficulty submitting a dispute or getting inf...,I submitted documentation from XXXX that I sub...
121,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,I contacted XXXX XXXX to inquire as to why a t...
122,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,IC Systems Re : Dispute of Account No. XXXX XX...
153,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Personal information incorrect,This is the 3 time I have disputed and filed a...


In [3]:
#Filtering the rows based on Unique values that have at leat 100 frequencies in a particular column.

def filter_rows(df, condition1, condition2, condition3, condition4):
    filtered_df = df[(df['Product'].isin(condition1)&df['Sub-product'].isin(condition2)&df['Issue'].isin(condition3)&df['Sub-issue'].isin(condition4))]
    return filtered_df

#Creating dictionary that have Unique values and at leat 100 frequencies in a particular column.
dictionary = {"Product":[], "Sub-product":[], "Issue":[], "Sub-issue":[]}
for col in dictionary.keys():
    for val in df[col].unique():
        if df[col].value_counts()[val] >= 100:
            dictionary[col].append(val)
            
data = filter_rows(df, dictionary["Product"], dictionary["Sub-product"], dictionary["Issue"], dictionary["Sub-issue"])
data

Unnamed: 0,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
41,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,Experian has allowed 6 inquiries on my credit ...
121,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,I contacted XXXX XXXX to inquire as to why a t...
122,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,IC Systems Re : Dispute of Account No. XXXX XX...
156,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,MY XXXX XXXX ACCOUNT HAS INCORECT BALANCE INFO...
202,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,Two medical bills showed up on my credit. \nXX...
...,...,...,...,...,...
49623,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,TransUnion is reporting an unverified Chapter ...
49633,Debt collection,Medical debt,Attempts to collect debt not owed,Debt was result of identity theft,I believe I am a victim of identity theft. Due...
49638,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,On XX/XX/XXXX i check my credit through XXXX X...
49796,Debt collection,Credit card debt,Attempts to collect debt not owed,Debt was result of identity theft,RE : Attentively review my formal writ compose...


In [4]:
#Cleaning the text
def clean(text, stem_words=True):
    import re
    from string import punctuation

    def pad_str(s):
        return ' '+s+' '

    if pd.isnull(text):
        return ''

    if type(text) != str or text=='':
        return ''

    text = re.sub("\'s", " ", text) 
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are", text)
    text = re.sub("\'d", " would", text)
    text = re.sub("\'ll", " will", text)
    text = re.sub("e-mail", "email", text, flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    text = re.sub('\$', " dollar", text)
    text = re.sub('\%', " percent", text)
    text = re.sub('\&', "and", text)
    text = re.sub(r'\b(?:x{4,}[\w ]*|xxxx xxxx)\b', "something", text, flags=re.IGNORECASE)
    text = re.sub(r"\n", " ", text, flags=re.IGNORECASE)
    text = ''.join([c for c in text if c not in punctuation]).lower()
    return text
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(clean)
data.head(5)

Unnamed: 0,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
41,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,experian has allowed 6 inquiries on my credit ...
121,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,i contacted something the agent verified a my ...
122,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,ic systems re dispute of account no something...
156,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,my something please update this account inform...
202,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,two medical bills showed up on my credit some...


In [7]:
# Data preprocessing
max_words = 11000  # Consider only the top 10,000 words in the dataset
max_len = 2000      # Limit the length of each description text to 100 words


tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['Consumer complaint narrative'])
sequences = tokenizer.texts_to_sequences(data['Consumer complaint narrative'])
X = pad_sequences(sequences, maxlen=max_len)

# Prepare the labels
y_product = pd.get_dummies(data['Product'])
y_sub_product = pd.get_dummies(data['Sub-product'])
y_issue = pd.get_dummies(data['Issue'])
y_sub_issue = pd.get_dummies(data['Sub-issue'])

# Train-test split
X_train, X_test, y_prod_train, y_prod_test, y_sub_prod_train, y_sub_prod_test, y_issue_train, y_issue_test, y_sub_issue_train, y_sub_issue_test = train_test_split(X,
                                                                                                                                                                    y_product, y_sub_product, y_issue, y_sub_issue,
                                                                                                                                                                    test_size=0.2, random_state=7, stratify=y_sub_issue)

In [8]:
# Load pre-trained Word2Vec embeddings
word2vec_model = api.load("word2vec-google-news-300")

# Create the embedding matrix using pre-trained Word2Vec embeddings
word_index = tokenizer.word_index
embedding_dim = 300  # Size of Word2Vec embeddings
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [24]:
# Build the model
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True)(input_layer)
lstm_layer = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
dropout_layer = Dropout(0.3)(lstm_layer)
global_maxpooling_layer = GlobalMaxPooling1D()(dropout_layer)
dense_layer = Dense(64, activation='relu')(global_maxpooling_layer)

# Category branch
product_output = Dense(len(data['Product'].unique()), activation='softmax', name='category')(dense_layer)

# Sub-category branch
sub_product_output = Dense(len(data['Sub-product'].unique()), activation='softmax', name='sub_category')(dense_layer)

# Issue branch
issue_output = Dense(len(data['Issue'].unique()), activation='softmax', name='issue')(dense_layer)

# Sub-issue branch
sub_issue_output = Dense(len(data['Sub-issue'].unique()), activation='softmax', name='sub_issue')(dense_layer)


In [25]:
# Create the model
model = Model(inputs=input_layer, outputs=[product_output, sub_product_output, issue_output, sub_issue_output])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
model.fit(X_train, [y_prod_train, y_sub_prod_train, y_issue_train, y_sub_issue_train], 
          epochs=20, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<keras.callbacks.History at 0x7d8f0e30ba90>

In [26]:
# Evaluate the model
loss, product_loss, sub_product_loss, issue_loss, sub_issue_loss, product_accuracy, sub_product_accuracy, issue_accuracy, sub_issue_accuracy = model.evaluate(X_test, [y_prod_test, y_sub_prod_test, y_issue_test, y_sub_issue_test], verbose=0)

print(f"Total loss: {loss}")
print(f"Product accuracy: {product_accuracy}")
print(f"Sub-product accuracy: {sub_product_accuracy}")
print(f"Issue accuracy: {issue_accuracy}")
print(f"Sub-issue accuracy: {sub_issue_accuracy}")

Total loss: 2.810638427734375
Product accuracy: 0.9196850657463074
Sub-product accuracy: 0.8992125988006592
Issue accuracy: 0.6960629820823669
Sub-issue accuracy: 0.5590550899505615


In [28]:
# Save the trained model
model.save("my_model.h5")

In [29]:
import joblib
joblib.dump(tokenizer, 'tokenizer.pkl')
joblib.dump(max_len, 'max_len.pkl')
joblib.dump(label_mapping_product, 'label_mapping_product.pkl')
joblib.dump(label_mapping_sub_product, 'label_mapping_sub_product.pkl')
joblib.dump(label_mapping_issue, 'label_mapping_issue.pkl')
joblib.dump(label_mapping_sub_issue, 'label_mapping_sub_issue.pkl')

['label_mapping_sub_issue.pkl']