In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

df =pd.read_csv("drive/MyDrive/CMPE-257-Project/complaints.csv")
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2022-11-22,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,,,"EQUIFAX, INC.",OK,74447.0,Servicemember,,Web,2022-11-22,In progress,Yes,,6231498
1,2022-11-16,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,,,Experian Information Solutions Inc.,NJ,7093.0,,,Web,2022-11-16,In progress,Yes,,6205383
2,2022-11-09,Debt collection,Credit card debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Genesis FS Card Services, Inc.",TX,75038.0,,,Web,2022-11-09,Closed with explanation,Yes,,6182699
3,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,,,NORTHERN OHIO INVESTMENT COMPANY,MO,63031.0,,Other,Web,2022-11-08,Untimely response,No,,6173945
4,2022-11-07,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,,,"R & R Collection Service, Inc.",MO,64154.0,,,Phone,2022-11-07,Untimely response,No,,6175998


Importing all required libraries

In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
import nltk
import string
nltk.download('stopwords')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Restricting the columns to only the Product(Class) and the Complaint text.

In [None]:
df = df[['Product' , 'Consumer complaint narrative']]
df = df.rename(columns={'Product' : "Product",
                       'Consumer complaint narrative' : "Complaint"})
df.head()

Unnamed: 0,Product,Complaint
0,"Credit reporting, credit repair services, or o...",
1,"Credit reporting, credit repair services, or o...",
2,Debt collection,
3,Mortgage,
4,Debt collection,


Cleaning up the Product column data.

In [None]:
#Clean up data by renaming similar products into one category of products.
df['Product'].replace({'Bank account or service': 'Banking Services',
                       'Checking or savings account' : 'Banking Services',
                       'Consumer Loan' : 'Loans',
                       'Credit card' : 'Credit/Prepaid Cards',
                       'Credit card or prepaid card' : 'Credit/Prepaid Cards',
                       'Credit reporting' : 'Credit Reporting and Services',
                       'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting and Services',
                       'Debt collection' : 'Debt Collection',
                       'Money transfer, virtual currency, or money service' : 'Banking Services',
                       'Money transfers' : 'Banking Services',
                       'Mortgage' : 'Mortgages',
                       'Other financial service' : 'Banking Services',
                       'Payday loan' : 'Loans',
                       'Payday loan, title loan, or personal loan' : 'Loans',
                       'Prepaid card' : 'Credit/Prepaid Cards',
                       'Student loan' : 'Loans',
                       'Vehicle loan or lease' : 'Loans',
                       'Virtual currency' : 'Crypto Currency'}, inplace=True)

Removing the NaN values. For this project, we are only concerned with complaint text.

In [None]:
df = df[pd.notnull(df['Complaint'])]
df.head()

Unnamed: 0,Product,Complaint
38,Debt Collection,XXXX. I do not owe any money to XXXX XXXX. I ...
39,Debt Collection,XXXX is attempting to collect funds for Valuat...
41,Credit Reporting and Services,EXPERIAN I didnt consent to these Inquiries Al...
68,Banking Services,Citibank froze my account that contained {$200...
129,Credit Reporting and Services,In accordance with the fair credit reporting a...


Building a CountVectorizer - This includes Text pre-processing, tokenizing and filtering of stopwords.

In [None]:
stop_words = stopwords.words('english') + list(string.punctuation)
stop_words += ["''", '""', '...', '``', '--', 'xxxx']

In [None]:
#Tokenize complaint data and remove stop words from complaint narrative.
def processComplaint(comp):
    tokens = nltk.word_tokenize(comp)
    removed_stop_words = [token.lower() for token in tokens if token.lower() not in stop_words]
    new_removed_stop_words = [word for word in removed_stop_words if word.isalpha()]
    
    return new_removed_stop_words

In [None]:
#Link words together.
def linkWords(words):
    linked_words = ''
    
    for w in words:
        linked_words += w + ' '
    
    return linked_words.strip()

In [None]:
lm = WordNetLemmatizer()

In [None]:
#Group variants of the same word and merge complaints.
def groupVariants(words):
    words = [word for word in words if word is not np.nan]
    
    lem_list = []
    
    for idx, word in enumerate(words):
        lem_list.append(lm.lemmatize(word))
    
    linked_str = linkWords(lem_list)
    
    return linked_str

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Eliminate stop words and group variants of words.
for i in range(len(df)):
    processed_complaints = processComplaint(df['Complaint'].iloc[i])
    complaint = groupVariants(processed_complaints)
    
    df['Complaint'].iloc[i] = complaint


Now, we'll consider the pre-processed complaints for further analysis.

In [7]:
import pandas as pd
ndf = pd.read_csv("drive/MyDrive/CMPE-257-Project/processed_complaints.csv")
ndf.head()

Unnamed: 0.1,Unnamed: 0,Product,Complaint
0,38,Debt Collection,owe money never received bill indicating balan...
1,39,Debt Collection,attempting collect fund valuation service prov...
2,41,Credit Reporting and Services,experian didnt consent inquiry also name incor...
3,68,Banking Services,citibank froze account contained time make col...
4,129,Credit Reporting and Services,accordance fair credit reporting act account v...


In [8]:
ndf.drop(['Unnamed: 0'], axis=1, inplace=True)
#Eliminate all null values.
ndf = ndf.dropna()
ndf['Product'].unique()


array(['Debt Collection', 'Credit Reporting and Services',
       'Banking Services', 'Mortgages', 'Credit/Prepaid Cards', 'Loans',
       'Crypto Currency'], dtype=object)

In [9]:
ndf['Product'].replace({'Debt Collection' : 0, 
                        'Credit Reporting and Services' : 1,
                        'Banking Services' : 2,
                        'Mortgages' : 3,
                        'Credit/Prepaid Cards' : 4,
                        'Loans' : 5,
                        'Crypto Currency' : 6}, inplace=True)

In [15]:
#Create train and test sets.
sample_df = ndf.sample(100000, random_state=42)
x = sample_df['Complaint']
y = sample_df['Product']

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [12]:
print("Training set size:" , x_train.shape)
print("Test set size:" , x_test.shape)

Training set size: (889871,)
Test set size: (222468,)


Performing a Tf-IDF transformation.

In [20]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=1000000)
tfidf_x_train = tfidf.fit_transform(x_train)
tfidf_x_test = tfidf.transform(x_test)

In [21]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=7)

clf = KNeighborsClassifier().fit(tfidf_x_train, y_train)

In [22]:
#Predictions for training and testing set.
pred_y_train = clf.predict(tfidf_x_train)
pred_y_test = clf.predict(tfidf_x_test)

print('Training prediction accuracy: ', accuracy_score(y_train, pred_y_train))
print('Testing prediction accuracy: ', accuracy_score(y_test, pred_y_test))

Training prediction accuracy:  0.808175
Testing prediction accuracy:  0.71515


In [23]:
print('Classification Report for KNN (Training Data)\n', classification_report(y_train, pred_y_train))

Classification Report for KNN (Training Data)
               precision    recall  f1-score   support

           0       0.67      0.85      0.75     13893
           1       0.82      0.96      0.88     39075
           2       0.91      0.66      0.77      6965
           3       0.95      0.69      0.80      7108
           4       0.88      0.47      0.61      7422
           5       0.96      0.42      0.59      5536
           6       0.00      0.00      0.00         1

    accuracy                           0.81     80000
   macro avg       0.74      0.58      0.63     80000
weighted avg       0.83      0.81      0.80     80000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
print('Classification Report for Naive Bayes (Test Data)\n', classification_report(y_test, pred_y_test))

Classification Report for Naive Bayes (Test Data)
               precision    recall  f1-score   support

           0       0.53      0.67      0.60      3456
           1       0.75      0.91      0.82      9749
           2       0.83      0.57      0.67      1784
           3       0.84      0.59      0.69      1760
           4       0.71      0.34      0.46      1803
           5       0.79      0.31      0.44      1447
           6       0.00      0.00      0.00         1

    accuracy                           0.72     20000
   macro avg       0.64      0.48      0.53     20000
weighted avg       0.73      0.72      0.70     20000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
