In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#Load the pre-processed complaint narratives from disk into a dataframe.
ndf = pd.read_csv(filepath_or_buffer='processed_complaints.csv')

In [3]:
#Display the freshly loaded dataframe to check its columns and row count.
ndf

Unnamed: 0.1,Unnamed: 0,Product,Complaint
0,38,Debt Collection,owe money never received bill indicating balan...
1,39,Debt Collection,attempting collect fund valuation service prov...
2,41,Credit Reporting and Services,experian didnt consent inquiry also name incor...
3,68,Banking Services,citibank froze account contained time make col...
4,129,Credit Reporting and Services,accordance fair credit reporting act account v...
...,...,...,...
1112415,3096750,Credit/Prepaid Cards,automated call capital one call home phone exc...
1112416,3096751,Debt Collection,disputed debt several time resolution submitte...
1112417,3096752,Mortgages,father died left son belonging also single mot...
1112418,3096753,Credit Reporting and Services,cfbp would like file complaint experian report...


In [4]:
#Eliminate the 'Unnamed: 0' column (presumably a row index saved by an earlier export; confirm).
#Rebinding ndf instead of using inplace=True, together with errors='ignore', keeps this
#cell safe to re-run: the in-place version raised KeyError on a second execution because
#the column was already gone.
ndf = ndf.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
#Print the number of null values in the Product column, then in the Complaint column.
for column in ('Product', 'Complaint'):
    print(ndf[column].isnull().sum())

0
81


In [6]:
#Discard every row that has a null in any column.
ndf = ndf.dropna(axis=0, how='any')

In [8]:
#Down-sample to a fixed number of rows; the fixed seed makes the draw repeatable.
sample_size = 100000
ndf = ndf.sample(n=sample_size, random_state=42)

In [9]:
#Display the sampled dataframe; the row labels are the original (now shuffled) index values.
ndf

Unnamed: 0,Product,Complaint
883878,Credit Reporting and Services,upon checking credit going credit report found...
725549,Mortgages,ocwen h overcharged amount mortgage account st...
385025,Credit/Prepaid Cards,friday attempted make payment payment appear s...
774159,Credit/Prepaid Cards,noticed credit card company charged late fee i...
305791,Mortgages,business company refinancing mortgage process ...
...,...,...
881495,Debt Collection,xxxxxxxx consumer sent letter via usps certifi...
966270,Credit Reporting and Services,disputed item circled attached report failed a...
898572,Credit Reporting and Services,fair credit reporting act fcra following may s...
768169,Credit Reporting and Services,year collection company dinged credit rating b...


In [10]:
#Confirm the (rows, columns) dimensions of the sampled dataframe.
ndf.shape

(100000, 2)

# Encoding Product feature.

In [11]:
#List the distinct product categories in order of first appearance.
pd.unique(ndf['Product'])

array(['Credit Reporting and Services', 'Mortgages',
       'Credit/Prepaid Cards', 'Debt Collection', 'Banking Services',
       'Loans', 'Crypto Currency'], dtype=object)

In [12]:
#Encode product names as integer class labels.
product_codes = {
    'Debt Collection': 0,
    'Credit Reporting and Services': 1,
    'Banking Services': 2,
    'Mortgages': 3,
    'Credit/Prepaid Cards': 4,
    'Loans': 5,
    'Crypto Currency': 6,
}
#Assign the encoded column back rather than calling replace(inplace=True) on the
#ndf['Product'] selection: that chained in-place call acts on a temporary object and
#does not update ndf when pandas Copy-on-Write is active.
#The explicit int64 cast replaces the deprecated implicit downcast in replace() and
#fails loudly if a product name is missing from the mapping. Re-running the cell is
#a no-op because already-encoded integers match no key.
ndf = ndf.assign(Product=ndf['Product'].replace(product_codes).astype('int64'))

In [13]:
#Count the remaining null values per column after cleaning and encoding.
ndf.isnull().sum()

Product      0
Complaint    0
dtype: int64

# Generate training and test sets.

In [15]:
#Separate the complaint text (features) from the encoded product label (target).
x, y = ndf['Complaint'], ndf['Product']

In [16]:
#Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

In [17]:
#Report the shape of each split.
for split_label, split_data in (("Training set size:", x_train), ("Test set size:", x_test)):
    print(split_label, split_data.shape)

Training set size: (80000,)
Test set size: (20000,)


# Obtaining relevance of words from complaint narratives.

In [18]:
#Weight unigrams and bigrams by TF-IDF, keeping at most 1,000,000 features.
tfidf = TfidfVectorizer(
    max_features=1000000,
    ngram_range=(1, 2),
)
#Fit on the training text only, then apply the same fitted vectorizer to the test text.
tfidf_x_train = tfidf.fit_transform(x_train)
tfidf_x_test = tfidf.transform(x_test)

# Modeling with Multinomial Naive Bayes classifier.

In [19]:
#Train a multinomial Naive Bayes classifier with default hyper-parameters on the TF-IDF features.
#NOTE(review): the recorded report shows recall 1.00 for class 1 and very low recall for
#the other classes, so the default smoothing may be too strong for this data; confirm.
nb = MultinomialNB()
nb.fit(X=tfidf_x_train, y=y_train)

MultinomialNB()

In [20]:
#Predict class labels for the training matrix and the test matrix with the fitted model.
pred_y_train, pred_y_test = (
    nb.predict(features) for features in (tfidf_x_train, tfidf_x_test)
)

In [21]:
#Report the accuracy on the training split, then on the held-out test split.
for acc_label, truth, predicted in (
    ('Training prediction accuracy: ', y_train, pred_y_train),
    ('Testing prediction accuracy: ', y_test, pred_y_test),
):
    print(acc_label, accuracy_score(truth, predicted))

Training prediction accuracy:  0.618375
Testing prediction accuracy:  0.56935


In [22]:
#Per-class precision, recall and F1 on the training split.
#zero_division=0 reports 0.0 for a class that is never predicted (class 6 in the recorded
#run) without raising UndefinedMetricWarning; the scores themselves are unchanged.
print('Classification Report for Naive Bayes (Training Data)\n',
      classification_report(y_train, pred_y_train, zero_division=0))

Classification Report for Naive Bayes (Training Data)
               precision    recall  f1-score   support

           0       0.95      0.31      0.47     13893
           1       0.56      1.00      0.72     39075
           2       0.97      0.34      0.51      6965
           3       0.99      0.37      0.54      7108
           4       0.98      0.12      0.22      7422
           5       1.00      0.03      0.07      5536
           6       0.00      0.00      0.00         1

    accuracy                           0.62     80000
   macro avg       0.78      0.31      0.36     80000
weighted avg       0.77      0.62      0.55     80000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Build the confusion matrix for the training-set predictions and draw it.
cmat_train = confusion_matrix(y_true=y_train, y_pred=pred_y_train)
cmat_train_disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cmat_train).plot()

#Print the caption before the figure is rendered.
print('Confusion Matrix for Naive Bayes (Train)')
plt.show()