In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

In [None]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.preprocessing import text, sequence
from keras import utils

In [None]:
# Read the complaints CSV into a DataFrame; on_bad_lines='skip' drops rows the
# parser cannot handle instead of raising.
complaints_csv_path = '/content/Consumer_Complaints.csv'
df = pd.read_csv(
    complaints_csv_path,
    sep=',',
    encoding='utf-8',
    escapechar='\\',
    on_bad_lines='skip',
)
df.head(3)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/21/2017,Credit reporting,,Incorrect information on credit report,Information is not mine,,Company has responded to the consumer and the ...,EXPERIAN DELAWARE GP,TX,77075,Older American,,Phone,03/21/2017,Closed with non-monetary relief,Yes,No,2397100.0
1,04/19/2017,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not disclosed as an attempt to collect,,,"Security Credit Services, LLC",IL,60643,,,Web,04/20/2017,Closed with explanation,Yes,No,2441777.0
2,04/19/2017,Credit card,,Other,,,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,62025,,,Referral,04/20/2017,Closed with explanation,Yes,No,2441830.0


In [None]:
# Show every column name as a plain Python list.
print(list(df.columns))

['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID']


In [None]:
# The two column names of interest, spelled as in the raw CSV header.
col = [
    'Consumer complaint narrative',
    'Product',
]

In [None]:
# Remove leading/trailing whitespace from every column name.
clean_column_names = df.columns.str.strip()
df.columns = clean_column_names

In [None]:
# Give the narrative column an identifier-style name.
# Reassigning the result (instead of inplace=True) keeps the cell safe to
# re-run and chainable; the former 'Product' -> 'Product' entry renamed nothing
# and is omitted.
df = df.rename(
    columns={'Consumer complaint narrative': 'Consumer_complaint_narrative'}
)

In [None]:
# Preview the first rows, then summarise dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" line).
df.info()

  Date received           Product                            Sub-product  \
0    03/21/2017  Credit reporting                                    NaN   
1    04/19/2017   Debt collection  Other (i.e. phone, health club, etc.)   
2    04/19/2017       Credit card                                    NaN   
3    04/14/2017          Mortgage                         Other mortgage   
4    04/19/2017       Credit card                                    NaN   

                                      Issue  \
0    Incorrect information on credit report   
1           Disclosure verification of debt   
2                                     Other   
3  Loan modification,collection,foreclosure   
4                      Credit determination   

                                Sub-issue Consumer_complaint_narrative  \
0                 Information is not mine                          NaN   
1  Not disclosed as an attempt to collect                          NaN   
2                                     

In [None]:
# Narrow the frame to the narrative text and its product label, keeping only
# rows whose narrative is present.
col = ['Consumer_complaint_narrative', 'Product']
df = df[col].dropna(subset=['Consumer_complaint_narrative'])

In [None]:
# Display the first five rows of the filtered frame.
df.head(5)

Unnamed: 0,Consumer_complaint_narrative,Product
9,Started the refinance of home mortgage process...,Mortgage
35,My wife and I visited the Chase Bank branch at...,Bank account or service
39,The service representative was harsh and not l...,Student loan
46,I have documentation that shows that US Bank w...,Bank account or service
54,Experian reporting of XXXX XXXX reflects a for...,Credit reporting


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
Consumer_complaint_narrative,0
Product,0


In [None]:
# Number of complaints per product label, most frequent first.
product_counts = df['Product'].value_counts()
product_counts

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
Debt collection,38741
Mortgage,32000
Credit reporting,30319
Credit card,18276
Bank account or service,14500
Student loan,10176
Consumer Loan,9029
Payday loan,1695
Money transfers,1437
Prepaid card,1404


In [None]:
# Sizes of an 80/20 partition of the rows; int() truncates the 80% cut point.
n_rows = len(df)
train_size = int(n_rows * 0.8)
print('Train size: %d' % train_size)
print('Test size: %d' % (n_rows - train_size))

Train size: 126292
Test size: 31573


In [None]:
# Positional, unshuffled split: the first train_size rows form the training
# set and the remaining rows form the test set.
narratives = df['Consumer_complaint_narrative']
products = df['Product']

train_narrative = narratives.iloc[:train_size]
test_narrative = narratives.iloc[train_size:]

train_product = products.iloc[:train_size]
test_product = products.iloc[train_size:]

In [None]:
# Word-level tokenizer capped at max_words vocabulary entries.
max_words = 1000
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

# The vocabulary is fitted on the training narratives only and then applied to
# both splits to build the feature matrices.
tokenize.fit_on_texts(train_narrative)
x_train, x_test = (
    tokenize.texts_to_matrix(narrative_split)
    for narrative_split in (train_narrative, test_narrative)
)

In [None]:
# Integer-encode the product labels; the encoder learns its classes from the
# training labels and the same mapping is reused for the test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_product)
y_test = encoder.transform(test_product)

In [None]:
# One-hot encode the integer labels. The class count is taken as the largest
# training label index plus one.
num_classes = y_train.max() + 1
y_train = utils.to_categorical(y_train, num_classes=num_classes)
y_test = utils.to_categorical(y_test, num_classes=num_classes)

In [None]:
# Sanity-check the feature and label array shapes for both splits.
for _caption, _array in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(_caption, _array.shape)

x_train shape: (126292, 1000)
x_test shape: (31573, 1000)
y_train shape: (126292, 12)
y_test shape: (31573, 12)


In [None]:
# Feed-forward classifier over the tokenizer matrix: one 512-unit ReLU hidden
# layer with 50% dropout, then a softmax over num_classes outputs.
# The input shape is declared with an explicit keras.Input layer; passing
# input_shape= to Dense makes Keras 3 emit a UserWarning recommending exactly
# this form for Sequential models.
model = Sequential([
    keras.Input(shape=(max_words,)),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Training hyper-parameters; validation_split=0.1 reserves a tenth of the
# training data for the per-epoch validation metrics.
batch_size = 32
epochs = 5

history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/5
[1m3552/3552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 8ms/step - accuracy: 0.7633 - loss: 0.7827 - val_accuracy: 0.8387 - val_loss: 0.5247
Epoch 2/5
[1m3552/3552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 8ms/step - accuracy: 0.8406 - loss: 0.5036 - val_accuracy: 0.8376 - val_loss: 0.5226
Epoch 3/5
[1m3552/3552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 8ms/step - accuracy: 0.8576 - loss: 0.4438 - val_accuracy: 0.8428 - val_loss: 0.5137
Epoch 4/5
[1m3552/3552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 8ms/step - accuracy: 0.8727 - loss: 0.3917 - val_accuracy: 0.8378 - val_loss: 0.5302
Epoch 5/5
[1m3552/3552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 8ms/step - accuracy: 0.8848 - loss: 0.3505 - val_accuracy: 0.8432 - val_loss: 0.5265


In [None]:
# Evaluate on the held-out test split; score is [loss, accuracy], matching the
# loss and metrics passed to compile().
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
test_loss, test_accuracy = score[0], score[1]
print('test score', test_loss)
print('test accuracy', test_accuracy)


[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8409 - loss: 0.5376
test score 0.5317007303237915
test accuracy 0.8423019647598267


In [None]:
# Spot-check the first ten test complaints against the model's predictions.
test_labels = encoder.classes_

# Predict the whole sample in a single call: one forward pass and one progress
# bar, instead of a separate model.predict() invocation per row.
sample_predictions = model.predict(x_test[:10])
sample_class_indices = np.argmax(sample_predictions, axis=1)

for i, class_index in enumerate(sample_class_indices):
    predicted_label = test_labels[class_index]
    # Show only the first 50 characters of each narrative.
    print(test_narrative.iloc[i][:50], '...')
    # ': ' separates the caption from its value; previously the two ran
    # together (e.g. "Actual LabelCredit card").
    print('Actual Label: ' + test_product.iloc[i])
    print('Predicted Label: ' + predicted_label + '\n')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
On XXXX a fraudulent transaction occurred for the  ...
Actual LabelCredit card
Predicted LabelCredit card

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
I have a a unverified account from ENHANCED RECOVE ...
Actual LabelDebt collection
Predicted LabelDebt collection

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
After an IRA CD matured, Citizens Bank associates, ...
Actual LabelBank account or service
Predicted LabelBank account or service

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
In XXXX of 2008, my husband and I sold our home to ...
Actual LabelCredit reporting
Predicted LabelMortgage

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
I 've recently received a letter threatening legal ...
Actual LabelDebt collection
Predicted LabelDebt collection

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2