In [3]:
# 使用Tensorflow + Keras 实现文本分类

import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import  sequence
from keras import utils
from tensorflow.keras.utils import to_categorical

print("You have Tensorflow version", tf.__version__)

You have Tensorflow version 2.16.1


In [4]:
df = pd.read_csv('./Consumer_Complaints.csv', encoding='latin-1')
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,3/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,3/17/2014,Closed with explanation,Yes,No,759217
1,10/1/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/5/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,6/8/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,6/10/2014,Closed with explanation,Yes,Yes,885638
4,9/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,9/13/2014,Closed with explanation,Yes,Yes,1027760


In [5]:
# 3. 只查看我们关注的两列,去除空值
col = ['Consumer complaint narrative', 'Product']
df = df[col]
df = df[pd.notnull(df['Consumer complaint narrative'])]
df.head()

Unnamed: 0,Consumer complaint narrative,Product
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
7,An account on my credit report has a mistaken ...,Credit reporting
12,This company refuses to provide me verificatio...,Debt collection
16,This complaint is in regards to Square Two Fin...,Debt collection


In [6]:
# 4. 是否存在空值
df.isnull().sum()

Consumer complaint narrative    0
Product                         0
dtype: int64

In [7]:
# 5. 各种Product出现的次数值
df['Product'].value_counts()

Product
Debt collection                                                                 47915
Mortgage                                                                        36582
Credit reporting                                                                31592
Credit card                                                                     18842
Bank account or service                                                         14888
Credit reporting, credit repair services, or other personal consumer reports    14671
Student loan                                                                    13304
Consumer Loan                                                                    9474
Credit card or prepaid card                                                      3355
Checking or savings account                                                      2142
Payday loan                                                                      1748
Money transfers                               

In [None]:
# 6. 真实的Data Science 干活之前，先要问下自己几个业务问题。
# 我们要干啥，要关注哪些数据。这些占据了 20%的时间。
# 我们这里不关注业务，我们只关注我们构建模型的流程

In [8]:
# 7. train_size 和 test_size
# Split data into train and test
train_size = int(len(df) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(df) - train_size))

Train size: 159976
Test size: 39994


In [9]:
# 8. 数据分割

train_narrative = df["Consumer complaint narrative"][:train_size]
train_product = df["Product"][:train_size]

test_narrative = df["Consumer complaint narrative"][train_size:]
test_product = df["Product"][train_size:]


In [10]:
# 9.将句子中的各个单词转换成矩阵，一个句子最多1000个单词[还是说所有句子1000个单词，到时候看下Keras的文档]
# 看一下x_train和x_test的大小
# 所有句子只保留前1000个最频繁出现的单词
# num_words: Optional. The maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept. (Default is None, which means all words will be kept.)
max_words = 1000
tokenize = Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_narrative)
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)

In [11]:
# 10. 把Label string转换成数字索引
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [12]:
# 11. 将labels转换成one-hot表示

num_classes = np.max(y_train) + 1
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)


In [13]:
# 12. 查看一下转换出来的特征和 label大小
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (159976, 1000)
x_test shape: (39994, 1000)
y_train shape: (159976, 18)
y_test shape: (39994, 18)


In [14]:
# 13. 构建并编译模型
# Dropout，每次反向传播的时候，只更新部分神经元的weight，
# 这样训练效果更好

model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
# 14. 开始训练模型
batch_size = 32
epochs = 5

history = model.fit(x_train, y_train, 
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/5


2024-04-20 06:56:30.541355: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 575912000 exceeds 10% of free system memory.


[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 5ms/step - accuracy: 0.7425 - loss: 0.8879 - val_accuracy: 0.8322 - val_loss: 0.5539
Epoch 2/5
[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 6ms/step - accuracy: 0.8129 - loss: 0.6106 - val_accuracy: 0.8356 - val_loss: 0.5344
Epoch 3/5
[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 6ms/step - accuracy: 0.8295 - loss: 0.5499 - val_accuracy: 0.8393 - val_loss: 0.5349
Epoch 4/5
[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6ms/step - accuracy: 0.8435 - loss: 0.4952 - val_accuracy: 0.8399 - val_loss: 0.5332
Epoch 5/5
[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.8560 - loss: 0.4507 - val_accuracy: 0.8370 - val_loss: 0.5485


In [16]:
# loss要下降，accuracy要上升，就是我们所期望的

In [17]:
# 15. 在测试集上测试，查看精度

score = model.evaluate(x_test, y_test, 
                        batch_size=batch_size,
                      verbose=1)
print(score)
print('Test score:', score[0])
print('Test accuracy:', score[1])

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6561 - loss: 1.3481
[1.960322380065918, 0.5246286988258362]
Test score: 1.960322380065918
Test accuracy: 0.5246286988258362


In [18]:
# 16. 单个预测结果，使用
text_labels = encoder.classes_

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predict_label = text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50], '...')
    print('Actual Label: ' + test_product.iloc[i])
    print('Predicted Label: ' + predict_label + '\n')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
When The President came out with the HARP program  ...
Actual Label: Mortgage
Predicted Label: Mortgage

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
I filed an dispute with Capital One Bank on XX/XX/ ...
Actual Label: Credit card
Predicted Label: Credit card

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
I am disputing account number XXXX with Midland Fu ...
Actual Label: Debt collection
Predicted Label: Debt collection

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
I opened a BarclayCard on XXXX XXXX to help rebuil ...
Actual Label: Credit card
Predicted Label: Credit card

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
My mortgage was sold to OCWEN by GMAC. With GMAC,  ...
Actual Label: Mortgage
Predicted Label: Mortgage

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
I contacted the b