In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
#Read new processed complaint data set.
ndf = pd.read_csv('processed_complaints.csv')

In [3]:
#Examining dataframe.
ndf

Unnamed: 0.1,Unnamed: 0,Product,Complaint
0,38,Debt Collection,owe money never received bill indicating balan...
1,39,Debt Collection,attempting collect fund valuation service prov...
2,41,Credit Reporting and Services,experian didnt consent inquiry also name incor...
3,68,Banking Services,citibank froze account contained time make col...
4,129,Credit Reporting and Services,accordance fair credit reporting act account v...
...,...,...,...
1112415,3096750,Credit/Prepaid Cards,automated call capital one call home phone exc...
1112416,3096751,Debt Collection,disputed debt several time resolution submitte...
1112417,3096752,Mortgages,father died left son belonging also single mot...
1112418,3096753,Credit Reporting and Services,cfbp would like file complaint experian report...


In [4]:
#Eliminate 'Unnamed:0' column.
ndf.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
ndf

Unnamed: 0,Product,Complaint
0,Debt Collection,owe money never received bill indicating balan...
1,Debt Collection,attempting collect fund valuation service prov...
2,Credit Reporting and Services,experian didnt consent inquiry also name incor...
3,Banking Services,citibank froze account contained time make col...
4,Credit Reporting and Services,accordance fair credit reporting act account v...
...,...,...
1112415,Credit/Prepaid Cards,automated call capital one call home phone exc...
1112416,Debt Collection,disputed debt several time resolution submitte...
1112417,Mortgages,father died left son belonging also single mot...
1112418,Credit Reporting and Services,cfbp would like file complaint experian report...


In [6]:
#Check how many null values are in the Complaint column.
ndf['Complaint'].isnull().sum()

81

In [7]:
#Eliminate all null values.
ndf = ndf.dropna()

In [8]:
ndf['Complaint'].isnull().sum()

0

In [9]:
ndf.shape

(1112339, 2)

# Encoding Product feature.

In [10]:
#View list of products in Product feature.
ndf['Product'].unique()

array(['Debt Collection', 'Credit Reporting and Services',
       'Banking Services', 'Mortgages', 'Credit/Prepaid Cards', 'Loans',
       'Crypto Currency'], dtype=object)

In [11]:
#Encode products in numeric values.
ndf['Product'].replace({'Debt Collection' : 0, 
                        'Credit Reporting and Services' : 1,
                        'Banking Services' : 2,
                        'Mortgages' : 3,
                        'Credit/Prepaid Cards' : 4,
                        'Loans' : 5,
                        'Crypto Currency' : 6}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf['Product'].replace({'Debt Collection' : 0,


In [12]:
ndf

Unnamed: 0,Product,Complaint
0,0,owe money never received bill indicating balan...
1,0,attempting collect fund valuation service prov...
2,1,experian didnt consent inquiry also name incor...
3,2,citibank froze account contained time make col...
4,1,accordance fair credit reporting act account v...
...,...,...
1112415,4,automated call capital one call home phone exc...
1112416,0,disputed debt several time resolution submitte...
1112417,3,father died left son belonging also single mot...
1112418,1,cfbp would like file complaint experian report...


# Generate training and test set.

In [13]:
#Create train and test sets.
x = ndf['Complaint']
y = ndf['Product']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [15]:
print("Training set size:" , x_train.shape)
print("Test set size:" , x_test.shape)

Training set size: (889871,)
Test set size: (222468,)


# Obtaining relevance of words from complaint narratives.

In [16]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=1000000)
tfidf_x_train = tfidf.fit_transform(x_train)
tfidf_x_test = tfidf.transform(x_test)