In [1]:
#importing all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras import regularizers

Load the dataset

In [2]:
complaints = pd.read_csv("complaints.csv")
df = complaints[['Consumer complaint narrative','Product']]
df = df.rename(columns={'Consumer complaint narrative': "narrative"})
df = df[df.narrative.notnull()]

In [3]:
df['word_count'] = df['narrative'].str.split().str.len()

In [4]:
df = (df[df['word_count'] >= 10])

In [5]:
df.loc[(df['Product'] == 'Credit reporting') | 
       (df['Product']=='Credit reporting, credit repair services, or other personal consumer reports'), 
       'Product'] = 'Credit or consumer reporting, credit repair services'

df.loc[(df['Product'] == 'Credit card') | 
       (df['Product']=='Credit card or prepaid card') |
       (df['Product'] == 'Prepaid card'), 
       'Product'] = 'Credit card or prepaid card'

df.loc[(df['Product'] == 'Money transfer, virtual currency, or money service') | 
       (df['Product']=='Money transfers') |
       (df['Product'] == 'Virtual currency'), 
       'Product'] = 'Money transfer or service, virtual currency'

df.loc[(df['Product'] == 'Payday loan') | 
       (df['Product']=='Payday loan, title loan, or personal loan') |
       (df['Product'] == 'Consumer Loan'), 
       'Product'] = 'Personal loan'

df.loc[(df['Product'] == 'Bank account or service') | 
       (df['Product']=='Other financial service'), 
       'Product'] = 'Other financial service'

In [6]:
df = (df[df['Product'] != 'Other financial service'])

In [7]:
df

Unnamed: 0,narrative,Product,word_count
1,transworld systems inc. \nis trying to collect...,Debt collection,18
3,"Over the past 2 weeks, I have been receiving e...",Debt collection,78
6,Pioneer has committed several federal violatio...,Debt collection,152
8,"Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...","Credit or consumer reporting, credit repair se...",171
9,Hello This complaint is against the three cred...,"Credit or consumer reporting, credit repair se...",428
...,...,...,...
1823676,1 : Mailing Address is incorrect. \n2 : Date o...,Personal loan,19
1823677,"I made a purchase of {$500.00} on XXXX XXXX, 2...",Credit card or prepaid card,44
1823680,"On XXXX XXXX, 2015, I contacted XXXX XXXX, who...",Mortgage,331
1823681,I can not get from chase who services my mortg...,Mortgage,21


In [10]:
df['narrative']

1          transworld systems inc. \nis trying to collect...
3          Over the past 2 weeks, I have been receiving e...
6          Pioneer has committed several federal violatio...
8          Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...
9          Hello This complaint is against the three cred...
                                 ...                        
1823676    1 : Mailing Address is incorrect. \n2 : Date o...
1823677    I made a purchase of {$500.00} on XXXX XXXX, 2...
1823680    On XXXX XXXX, 2015, I contacted XXXX XXXX, who...
1823681    I can not get from chase who services my mortg...
1823682    cfbp i would Like to file a complaint on Exper...
Name: narrative, Length: 597874, dtype: object

Data Cleaning inspired by:
https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

In [8]:
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [9]:
def build_vocab(sentences, verbose=True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable= (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [12]:
sentences = df['narrative'].progress_apply(lambda x: str(x).split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|██████████| 597874/597874 [03:41<00:00, 2702.35it/s] 
100%|██████████| 597874/597874 [02:23<00:00, 4165.00it/s]


{'transworld': 65, 'systems': 3213, 'inc.': 290, 'is': 1054633, 'trying': 67193}


In [14]:
from gensim.models import KeyedVectors

In [10]:
news_path = 'GoogleNews-vectors-negative300.bin.gz'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [35]:
embedding_index

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1b38c74f3a0>

In [33]:
embedding_index = KeyedVectors.load('vectors.kv')

MemoryError: Unable to allocate 3.35 GiB for an array with shape (900000000,) and data type float32

In [16]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:
            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [19]:
oov = check_coverage(vocab, vectors)

100%|██████████| 432682/432682 [01:00<00:00, 7110.20it/s] 


Found embeddings for 18.68% of vocab
Found embeddings for  79.65% of all text


In [29]:
import re
import string

def text_cleaning(text):
    #STOPWORDS = set(stopwords.words('english'))
    '''Turn the text into all lowercase, remove brackets, 
    punctuation, and numbers, and removing stopwords'''
    #text = text.lower()
    #text = re.sub('[/(){}\[\]\|@,;?!\.$]', '', text)
    #text = re.sub('[^0-9a-z #+_]', '', text)
    #text = re.sub('[&]', ' & ')
    #text = re.sub('\w*\d\w*', '', text)
    #combine_whitespace = re.compile(r"\s+")
    #text = combine_whitespace.sub(" ", text).strip()
    #text = ' '.join(text.split())


    #text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    
    return text

In [23]:
df['new'] = df['narrative'].progress_apply(clean_text)

100%|██████████| 597874/597874 [10:23<00:00, 959.50it/s] 


In [30]:
df['new'] = df['new'].progress_apply(text_cleaning)

100%|██████████| 597874/597874 [00:11<00:00, 52504.60it/s] 


In [31]:
sentences = df['new'].progress_apply(lambda x: str(x).split())
vocab = build_vocab(sentences)

100%|██████████| 597874/597874 [06:00<00:00, 1658.31it/s] 
100%|██████████| 597874/597874 [02:34<00:00, 3862.31it/s]


In [23]:
import re

In [24]:
def clean_text(x):
    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [29]:
df['new'] = df['narrative'].progress_apply(clean_text)
sentences = df['new'].progress_apply(lambda x: str(x).split())
vocab = build_vocab(sentences)

100%|██████████| 597874/597874 [02:33<00:00, 3903.61it/s]
100%|██████████| 597874/597874 [10:16<00:00, 969.02it/s]  
100%|██████████| 597874/597874 [03:34<00:00, 2783.98it/s]


In [30]:
print({k: vocab[k] for k in list(vocab)[:10]})

{'transworld': 69, 'systems': 4072, 'inc': 857, 'is': 1063715, 'trying': 67432, 'to': 3787094, 'collect': 40642, 'a': 2189136, 'debt': 290977, 'that': 1735365}


In [37]:
oov = check_coverage(vocab,embedding_index)

100%|██████████| 211466/211466 [00:44<00:00, 4776.76it/s] 

Found embeddings for 39.93% of vocab
Found embeddings for  90.09% of all text





In [38]:
oov[:100]

[('to', 3787094),
 ('and', 3041890),
 ('a', 2189136),
 ('of', 1954694),
 ('Navient', 30851),
 ('didnt', 16591),
 ('cancelled', 14898),
 ('judgement', 6798),
 ('wasnt', 5783),
 ('equifax', 5339),
 ('Shellpoint', 5154),
 ('Coinbase', 5021),
 ('experian', 4326),
 ('Seterus', 4207),
 ('EXPERIAN', 4191),
 ('COVID', 4035),
 ('doesnt', 3976),
 ('transunion', 3589),
 ('PSLF', 3147),
 ('Comenity', 3140),
 ('VALIDATION', 3108),
 ('FedLoan', 2914),
 ('onXX', 2898),
 ('BUREAUS', 2670),
 ('isnt', 2588),
 ('XXXXI', 2568),
 ('TRANSUNION', 2559),
 ('Covid', 2128),
 ('Loancare', 2123),
 ('acknowledgement', 1978),
 ('LVNV', 1922),
 ('tradeline', 1880),
 ('XXXXand', 1753),
 ('cfpb', 1691),
 ('XXXX####', 1533),
 ('navient', 1423),
 ('XXXXDispute', 1314),
 ('XXXX##', 1302),
 ('cancelling', 1257),
 ('coinbase', 1253),
 ('Fedloan', 1228),
 ('OnXX', 1226),
 ('QWR', 1224),
 ('shouldnt', 1171),
 ('VERIFIABLE', 1170),
 ('covid', 1167),
 ('NAVIENT', 1166),
 ('Inquired', 1155),
 ('NewRez', 1143),
 ('DEPTEDXXXX', 1

In [35]:
for i in range(10):
    print(embeddings_index.index2entity[i])

</s>
in
for
that
is
on
##
The
with
said


In [21]:
import re

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [None]:
df['new'] = df['new'].progress_apply(lambda x: clean_numbers(x))
sentences = df['new'].progress_apply(lambda x: str(x).split())
vocab = build_vocab(sentences)

100%|██████████| 597874/597874 [02:23<00:00, 4180.05it/s]
  9%|▊         | 51021/597874 [00:02<00:27, 19997.00it/s]

In [None]:
oov = check_coverage(vocab,embeddings_index)

In [41]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'wasnt':'was not',
                'judgement':'judgment',
                'isnt': 'is not',
                'cancelled':'canceled',
                'cancelling':'canceling',
                'hasnt':'has not',
                'dont':'do not'
                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [42]:
df['new'] = df['new'].progress_apply(replace_typical_misspell)

100%|██████████| 597874/597874 [00:37<00:00, 15951.60it/s]


In [43]:
sentences = df['new'].progress_apply(lambda x: str(x).split())
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|██████████| 597874/597874 [13:44<00:00, 725.53it/s]   
100%|██████████| 597874/597874 [03:55<00:00, 2539.30it/s]
100%|██████████| 597874/597874 [03:22<00:00, 2951.30it/s]


In [45]:
oov = check_coverage(vocab,embedding_index)

100%|██████████| 211433/211433 [00:41<00:00, 5126.66it/s]


Found embeddings for 39.93% of vocab
Found embeddings for  99.49% of all text


In [40]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 157740/157740 [00:19<00:00, 7933.70it/s] 


Found embeddings for 32.33% of vocab
Found embeddings for  99.01% of all text


In [46]:
df.to_csv(r'new_complaints3.csv', index = False)

In [47]:
df

Unnamed: 0,narrative,Product,word_count,new
1,transworld systems inc. \nis trying to collect...,Debt collection,18,transworld systems inc \nis trying to collect ...
3,"Over the past 2 weeks, I have been receiving e...",Debt collection,78,Over the past 2 weeks I have been receiving ex...
6,Pioneer has committed several federal violatio...,Debt collection,152,Pioneer has committed several federal violatio...
8,"Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...","Credit or consumer reporting, credit repair se...",171,Previously on XX XX XXXX XX XX XXXX and XX XX ...
9,Hello This complaint is against the three cred...,"Credit or consumer reporting, credit repair se...",428,Hello This complaint is against the three cred...
...,...,...,...,...
1823676,1 : Mailing Address is incorrect. \n2 : Date o...,Personal loan,19,1 Mailing Address is incorrect \n2 Date of B...
1823677,"I made a purchase of {$500.00} on XXXX XXXX, 2...",Credit card or prepaid card,44,I made a purchase of ##### on XXXX XXXX #### u...
1823680,"On XXXX XXXX, 2015, I contacted XXXX XXXX, who...",Mortgage,331,On XXXX XXXX #### I contacted XXXX XXXX who is...
1823681,I can not get from chase who services my mortg...,Mortgage,21,I can not get from chase who services my mortg...


In [48]:
oov[:100]

[('Navient', 30851),
 ('equifax', 5339),
 ('Shellpoint', 5154),
 ('Coinbase', 5021),
 ('experian', 4326),
 ('Seterus', 4207),
 ('EXPERIAN', 4191),
 ('COVID', 4035),
 ('transunion', 3589),
 ('PSLF', 3147),
 ('Comenity', 3140),
 ('VALIDATION', 3108),
 ('FedLoan', 2914),
 ('onXX', 2898),
 ('BUREAUS', 2670),
 ('XXXXI', 2568),
 ('TRANSUNION', 2559),
 ('Covid', 2128),
 ('Loancare', 2123),
 ('acknowledgement', 1978),
 ('LVNV', 1922),
 ('tradeline', 1880),
 ('XXXXand', 1753),
 ('cfpb', 1691),
 ('XXXX####', 1533),
 ('navient', 1423),
 ('XXXXDispute', 1314),
 ('XXXX##', 1302),
 ('coinbase', 1253),
 ('Fedloan', 1228),
 ('OnXX', 1226),
 ('QWR', 1224),
 ('VERIFIABLE', 1170),
 ('covid', 1167),
 ('NAVIENT', 1166),
 ('Inquired', 1155),
 ('NewRez', 1143),
 ('DEPTEDXXXX', 1123),
 ('Dovenmuehle', 1087),
 ('tradelines', 1060),
 ('inXX', 1043),
 ('learnmore', 1033),
 ('NationStar', 1004),
 ('DEROGATORY', 998),
 ('VALIDITY', 875),
 ('fcra', 849),
 ('Netspend', 842),
 ('SYNCB', 824),
 ('Amerihome', 811),
 ('