In [42]:
import pandas as pd

# Load the data from the JSON file
complaints_data = pd.read_json("complaints-2023-08-25_18_02.json")

# Display the first few rows of the data
complaints_data.head()

Unnamed: 0,_index,_type,_id,_score,_source,sort
0,complaint-public-v2,_doc,5471601,,"{'product': 'Checking or savings account', 'co...",[16]
1,complaint-public-v2,_doc,5020019,,"{'product': 'Checking or savings account', 'co...",[108]
2,complaint-public-v2,_doc,7203230,,"{'product': 'Vehicle loan or lease', 'complain...",[136]
3,complaint-public-v2,_doc,3743284,,"{'product': 'Checking or savings account', 'co...",[156]
4,complaint-public-v2,_doc,2927362,,"{'product': 'Credit reporting, credit repair s...",[188]


load data: narratives of issues of Vehicle loan or lease

In [43]:
# Extract the relevant details from the _source column
complaints_details = complaints_data['_source'].apply(pd.Series)

# Filter entries where product is "Vehicle loan or lease"
vehicle_complaints = complaints_details[complaints_details['product'] == 'Vehicle loan or lease']

# Filter out entries without narratives
vehicle_complaints_with_narrative = vehicle_complaints[vehicle_complaints['complaint_what_happened'].notnull()]
vehicle_complaints_with_narrative

# Retrieve only the issues and the related narratives
issues_and_narratives = vehicle_complaints_with_narrative[['issue', 'complaint_what_happened']]
issues_and_narratives.head()

# Filter out entries with empty narratives
cleaned_issues_and_narratives = issues_and_narratives[issues_and_narratives['complaint_what_happened'].str.strip() != ""]
cleaned_issues_and_narratives.head()

Unnamed: 0,issue,complaint_what_happened
8,Managing the loan or lease,Yes I called ally bank about getting exstentio...
13,Managing the loan or lease,This issue has persisted for multiple years de...
17,Getting a loan or lease,I received notice dated XX/XX/2019 from Ally B...
19,Managing the loan or lease,Shortly after purchasing the vehicle I cancele...
23,Problems at the end of the loan or lease,Ally Financial was the lender used to lease a ...


In [44]:
# Recount the issues after filtering out empty narratives and rank them in descending order
cleaned_issue_counts = cleaned_issues_and_narratives['issue'].value_counts()
cleaned_issue_counts

Managing the loan or lease                                                          612
Problems at the end of the loan or lease                                            467
Struggling to pay your loan                                                         312
Getting a loan or lease                                                             230
Incorrect information on your report                                                142
Problem with a credit reporting company's investigation into an existing problem     90
Improper use of your report                                                          31
Credit monitoring or identity theft protection services                               5
Unable to get your credit report or credit score                                      2
Problem with fraud alerts or security freezes                                         1
Name: issue, dtype: int64

Step 1: Data Preprocessing

In [45]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# 1.1 Select the top 5 issues
top_5_issues = cleaned_issue_counts.head(5).index
filtered_data = cleaned_issues_and_narratives[cleaned_issues_and_narratives['issue'].isin(top_5_issues)]

# 1.2 Tokenize and pad the narratives
MAX_WORDS = 20000  # number of words to consider as features
MAXLEN = 400  # cut off the texts after this number of words

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(filtered_data['complaint_what_happened'])
sequences = tokenizer.texts_to_sequences(filtered_data['complaint_what_happened'])
data_padded = pad_sequences(sequences, maxlen=MAXLEN)

# 1.3 Convert issue labels into categorical values
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(filtered_data['issue'])
labels_categorical = to_categorical(labels_encoded)

data_padded.shape, labels_categorical.shape


((1763, 400), (1763, 5))

Step 2: Model Building

2.1 Model Architecture:

Embedding Layer: This layer will convert tokenized words into dense vectors.

Dense Layers: A couple of dense layers for further processing.

Dropout Layers: To prevent overfitting.

Output Layer: With 5 nodes and a softmax activation.

In [46]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten

# Define the model
model = Sequential()

# Embedding Layer
model.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAXLEN))

# Flatten the 3D tensor of embeddings into a 2D tensor
model.add(Flatten())

# Dense Layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.7))

# Output Layer
model.add(Dense(5, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_27 (Embedding)    (None, 400, 128)          2560000   
                                                                 
 flatten_27 (Flatten)        (None, 51200)             0         
                                                                 
 dense_58 (Dense)            (None, 64)                3276864   
                                                                 
 dropout_31 (Dropout)        (None, 64)                0         
                                                                 
 dense_59 (Dense)            (None, 32)                2080      
                                                                 
 dropout_32 (Dropout)        (None, 32)                0         
                                                                 
 dense_60 (Dense)            (None, 5)               

Step 3: Data Splitting

Split the data into training, validation, and test sets. We'll use an 80-10-10 split for demonstration.

In [47]:
from sklearn.model_selection import train_test_split

# Splitting data into train, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(data_padded, labels_categorical, test_size=0.1, stratify=labels_categorical)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.11, stratify=y_temp)  # 0.11 x 0.9 = 0.099


Step 4: Model Training

Train the model using the training set and validate it with the validation set.

In [48]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


Step 5: Evaluation

Evaluate the model's performance on the test set.

In [50]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

Test Accuracy: 46.89%
