# Data Exploration - Team Gambinos

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [77]:
complaints = pd.read_csv('../data/complaints.csv')

We decided as a team to adopt the following naming convention for consistency:

In [78]:
complaints.columns = [x.replace('Consumer ', '').lower().replace(' ', '_') for x in complaints.columns]
complaints['complaint_length'] = complaints['complaint_narrative'].apply(len)


In [79]:
complaints['issue'].value_counts()

Incorrect information on your report    229305
Attempts to collect debt not owed        73163
Communication tactics                    21243
Struggling to pay mortgage               17374
Fraud or scam                            12347
Name: issue, dtype: int64

In [80]:
complaints_dict = {'Attempts to collect debt not owed': 1, 
                 'Communication tactics': 2, 
                 'Fraud or scam': 3, 
                 'Incorrect information on your report': 4,
                 'Struggling to pay mortgage': 5}

In [81]:
# to replace the text in issue with target variables
complaints = complaints.replace({'issue': complaints_dict})

In [82]:
complaints['complaint_narrative']

0         My name is XXXX XXXX this complaint is not mad...
1         I searched on XXXX for XXXXXXXX XXXX  and was ...
2         I have a particular account that is stating th...
3         I have not supplied proof under the doctrine o...
4         Hello i'm writing regarding account on my cred...
                                ...                        
353427           Collections account I have no knowledge of
353428    Dear CFPB Team, The reason for my complaint is...
353429    FRCA violations : Failing to Follow Debt Dispu...
353430    My Father, a XXXX XXXX  acquired an HECM rever...
353431    I have tried to contact cash app about a fraud...
Name: complaint_narrative, Length: 353432, dtype: object

In [83]:
seed = 123
for complaint in complaints.loc[complaints['issue'] == 4, 'complaint_narrative'].sample(3, random_state=seed):
    print(complaint)
    print('-----------------------------')

I just reviewed a copy of my Experian credit report and found the following information to be an error. I am a victim of identity theft and did not make these charges. I reported the theft of my identity to the Federal Trade Commission.
-----------------------------
after my legal separation from my husband he started to open credit in my name with no permission I have a legal case against him but can not find him he has ruined my life his name is XXXX XXXX XXXX   remove I have disputed by mail with all three credit bureaus. act # XXXX owing an alleged {$2200.00} XXXX   XXXX  orig creditor XXXX
-----------------------------
Equifax report a collections account " XXXX XXXX XXXX ''. I have no knowledge of this account. I have not been furnished any proof or verifications of this account. I don't have any signed contract agreements with XXXX XXXX XXXX, verbal agreements nor any paperwork associated with them. A proper investigation according to FCRA was not conducted, word of mouth from t

In [84]:
# remove double x-s or more
complaints['complaint_narrative'] = complaints['complaint_narrative'].str.replace(r'[xX][xX]+','', regex=True)

# remove double __ or more
complaints['complaint_narrative'] = complaints['complaint_narrative'].str.replace(r'__+','', regex=True)

# stripping all numeric values from complaints
complaints['complaint_narrative'] = complaints['complaint_narrative'].str.replace('\d+', '', regex=True)

# removing digit removes "4927" words

In [85]:
#complaints.groupby('issue')['complaint_narrative'].apply(lambda x: x.str.extractall(r'(\w+X+\w+)').groupby(0).size().reset_index(name='count'))

In [86]:
complaints.loc[complaints['complaint_narrative'].str.find('TOXIC') != -1, 'complaint_narrative'].iloc[3]

'  and      , CA  NATIONSTAR/.   #  We need immediate assistance with our   property loan.     is the investor of the property and   is the servicer for the loan. \n\nOur property has a domestic water well and the water is unsafe to drink. The significant and very serious water contaminations in the aquifers that serve our water well include : Lead Hexavalent Chromium ( documented in the movie entitled   ) and Arsenic. These chemicals are listed on the CA Proposition  List of Contaminants that Cause Cancer or Reproductive Harm. Serial testing of the water well by a CA Certified Laboratory defined the contamination to be steady-state as we were informed by the US EPA. There is no filtration system that can reduce these high levels down to potable safe drinking water levels. Our home is now defined* as : HAZARDOUS TOXIC CONTAMINATED UNINHABITABLE POLLUTED NON-POTABLE WATER BLIGHTED AND A NUISANCE ( *as defined by the CA List of Contaminants Proposition  State of CA, EPA Office of Environ

In [87]:
X = complaints[['complaint_narrative']]
y = complaints['issue']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

In [88]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t).lower()
                for t in regexp_tokenize(doc, r'(?u)\b\w\w+\b')
                if t.lower() not in stop_words] #word_tokenize(doc)

In [109]:
# note: the default stop words remove 311 words; have to specify it, otherwise it's off
#vect = CountVectorizer(lowercase=True, stop_words='english', ) 

#r”(?u)\b\w\w+\b”
stop_words = set(stopwords.words('english'))
vect = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=2, max_df=0.8)  

X_train_vec = vect.fit_transform(X_train['complaint_narrative'])
X_test_vec = vect.transform(X_test['complaint_narrative'])



In [110]:
vect.vocabulary_

{'filed': 11510,
 'identity': 14264,
 'theft': 28985,
 'report': 24375,
 'learned': 16801,
 'someone': 27042,
 'using': 31185,
 'incarceration': 14745,
 'submitted': 27953,
 'documentation': 9144,
 'transunion': 29817,
 'disregarded': 8930,
 'even': 10547,
 'block': 3590,
 'account': 301,
 'day': 7411,
 'upon': 31066,
 'receiving': 23395,
 'information': 15149,
 'also': 1290,
 'received': 23385,
 'letter': 16950,
 'irs': 16009,
 'stating': 27538,
 'tax': 28643,
 'return': 24976,
 'name': 18971,
 'u': 30169,
 'dept': 8085,
 'educatio': 9662,
 'acct': 362,
 'closed': 5317,
 'send': 26079,
 'written': 32556,
 'notification': 19579,
 'reported': 24385,
 'consumer': 6321,
 'reporting': 24399,
 'agency': 944,
 'never': 19259,
 'ever': 10556,
 'late': 16679,
 'although': 1311,
 'updated': 31029,
 'chargeoff': 4897,
 'providing': 22660,
 'severe': 26259,
 'scandalous': 25731,
 'action': 507,
 'destroying': 8248,
 'perfect': 21171,
 'payment': 21001,
 'history': 13769,
 'credit': 6981,
 'battle

In [112]:
len(vect.vocabulary_)

32784

### Naive Bayes First

In [113]:
word_counts = pd.DataFrame({
    'words': vect.get_feature_names_out(),
    'frequency': np.array(X_train_vec.sum(axis = 0)).flatten()
})

word_counts.head(20)

Unnamed: 0,words,frequency
0,_acct,2
1,_addresses,6
2,_chapter,4
3,_collections,2
4,_dept,2
5,_employment,8
6,_hard,3
7,_identity,3
8,_included,2
9,_local,3


In [114]:
word_counts[word_counts['frequency'] < 3]

Unnamed: 0,words,frequency
0,_acct,2
3,_collections,2
4,_dept,2
8,_included,2
10,_me,2
...,...,...
32761,zestfinance,2
32762,zestimate,2
32764,zi,2
32778,zoning,2


In [115]:
word_counts.sort_values('frequency', ascending=False).head(25)

Unnamed: 0,words,frequency
6981,credit,553340
301,account,509923
24375,report,359288
15149,information,290532
24399,reporting,213314
7492,debt,182786
21001,payment,172136
6321,consumer,168238
5773,company,133872
2836,balance,120282


In [116]:
from sklearn.naive_bayes import MultinomialNB

In [117]:
nb = MultinomialNB().fit(X_train_vec, y_train)

y_pred = nb.predict(X_test_vec)

In [118]:
accuracy_score(y_test, y_pred)

0.8202313316281491

In [119]:
confusion_matrix(y_test, y_pred)

array([[11629,  2133,   472,  3770,   287],
       [  556,  4533,    49,    96,    77],
       [   60,    62,  2825,   109,    31],
       [ 4822,   880,   771, 49259,  1594],
       [   29,    36,    17,    33,  4228]])

In [120]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.68      0.64      0.66     18291
           2       0.59      0.85      0.70      5311
           3       0.68      0.92      0.78      3087
           4       0.92      0.86      0.89     57326
           5       0.68      0.97      0.80      4343

    accuracy                           0.82     88358
   macro avg       0.71      0.85      0.77     88358
weighted avg       0.83      0.82      0.82     88358



### LogReg

In [121]:
logreg = LogisticRegression(max_iter = 10000).fit(X_train_vec, y_train)

y_pred = logreg.predict(X_test_vec)

print(accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

0.8687724937187351
[[11942   780   122  5370    77]
 [  912  3987    29   348    35]
 [  148    28  2630   258    23]
 [ 2574   129   105 54356   162]
 [  102    43    31   319  3848]]
              precision    recall  f1-score   support

           1       0.76      0.65      0.70     18291
           2       0.80      0.75      0.78      5311
           3       0.90      0.85      0.88      3087
           4       0.90      0.95      0.92     57326
           5       0.93      0.89      0.91      4343

    accuracy                           0.87     88358
   macro avg       0.86      0.82      0.84     88358
weighted avg       0.86      0.87      0.87     88358



In [122]:
vect = TfidfVectorizer(min_df=2, max_df=0.8)

X_train_vec = vect.fit_transform(X_train['complaint_narrative'])
X_test_vec = vect.transform(X_test['complaint_narrative'])

In [123]:
logreg = LogisticRegression(max_iter = 10000).fit(X_train_vec, y_train)

y_pred = logreg.predict(X_test_vec)

print(accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

0.8826591819642817
[[12936   616   107  4537    95]
 [  903  4115    18   248    27]
 [  102    25  2697   250    13]
 [ 2766    74    62 54247   177]
 [   46    28    11   263  3995]]
              precision    recall  f1-score   support

           1       0.77      0.71      0.74     18291
           2       0.85      0.77      0.81      5311
           3       0.93      0.87      0.90      3087
           4       0.91      0.95      0.93     57326
           5       0.93      0.92      0.92      4343

    accuracy                           0.88     88358
   macro avg       0.88      0.84      0.86     88358
weighted avg       0.88      0.88      0.88     88358



## Exploring the incorrect classifications

In [102]:
complaints_dict

{'Attempts to collect debt not owed': 1,
 'Communication tactics': 2,
 'Fraud or scam': 3,
 'Incorrect information on your report': 4,
 'Struggling to pay mortgage': 5}

First look at where 1 and 2 get switched around

In [63]:
y_test

49201     4
46316     1
175191    4
253139    4
290709    4
         ..
332664    4
150037    3
13558     4
82158     4
340823    5
Name: issue, Length: 88358, dtype: int64

In [58]:
# I could also turn into a pandas and have double index
print(np.where(y_test == 1)[0])
print(np.where(y_pred == 1)[0])

[    1     7    12 ... 88335 88341 88344]
[    1    11    13 ... 88335 88341 88344]


In [60]:
len(np.intersect1d(np.where(y_test == 1)[0], np.where(y_pred == 1)[0]))

12942

In [72]:
y_test.loc[1].index

AttributeError: 'numpy.int64' object has no attribute 'index'