In [38]:
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
import pickle

In [2]:
# Some utility functions
def load_pickle(path):
    """Read the pickle file at ``path`` and return the object stored in it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you created yourself or otherwise trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def save_pickle(path, file_name, object_):
    """Pickle ``object_`` into the directory ``path`` as ``<file_name>.pkl``.

    Parameters
    ----------
    path : str
        Directory to write into. A trailing separator is tolerated, and an
        empty string means the current working directory.
    file_name : str
        Target file name. The ``.pkl`` extension is appended only when it is
        not already present, so both ``'tokens'`` and ``'tokens.pkl'`` end up
        as ``tokens.pkl``.
    object_ : object
        Any picklable Python object.

    Raises
    ------
    OSError
        If the file cannot be opened for writing (e.g. ``path`` does not exist).
    """
    import os  # local import: the notebook's import cell does not provide `os`

    # Avoid producing "name.pkl.pkl" when the caller already included the extension.
    if not file_name.endswith('.pkl'):
        file_name = file_name + '.pkl'

    # os.path.join avoids the doubled separator for paths ending in '/', and
    # keeps an empty `path` relative instead of pointing at the filesystem root.
    file_path = os.path.join(path, file_name)
    with open(file_path, 'wb') as f:
        pickle.dump(object_, f)
    print('The provided object has been saved to {} as {}'.format(path, file_name))

## Data Wrangling and EDA

In [3]:
# Load the raw complaints table from the gzip-compressed parquet file.
raw_data = pd.read_parquet(
    path='../input/df.parquet.gzip',
    engine='pyarrow',
)

In [4]:
# Preview the first five rows to see which columns are available.
raw_data.head(5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-06-13,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,CAPITAL ONE FINANCIAL CORPORATION,PA,186XX,,Consent not provided,Web,2019-06-13,Closed with explanation,Yes,,3274605
1,2019-11-01,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257
2,2019-04-01,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,19067,,Consent not provided,Web,2019-04-01,Closed with explanation,Yes,,3198225
3,2021-11-01,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",GA,31707,,,Web,2021-11-01,In progress,Yes,,4863965
4,2021-11-02,Debt collection,Medical debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Medical Data Systems, Inc.",VA,22033,,,Web,2021-11-02,In progress,Yes,,4866449


### Column of Interest

In [5]:
# Read one full narrative (row label 1) to get a feel for the free text.
raw_data.loc[1, 'Consumer complaint narrative']

'I contacted Ally on Friday XX/XX/XXXX after falling behind on payments due to being out of work for a short period of time due to an illness. I chated with a representative after logging into my account regarding my opitions to ensure I protect my credit and bring my account current. \n\nShe advised me that before an extenstion could be done, I had to make a payment in the amount of {$270.00}. I reviewed my finances, as I am playing catch up on all my bills and made this payment on Monday XX/XX/XXXX. This rep advised me, once this payment posts to my account to contact Ally back for an extention or to have a payment deffered to the end of my loan. \n\nWith this in mind, I contacted Ally again today and chatted with XXXX. I explained all of the above and the information I was provided when I chatted with the rep last week. She asked several questions and advised me that a one or two month  extension/deffered payment could be done however partial payment is needed! WHAT? She advised me 

In [6]:
# Product label attached to the complaint at row label 1.
raw_data.loc[1, 'Product']

'Vehicle loan or lease'

Observations:
1) Some complaints (rows) do not contain a complaint narrative. 
2) Some sensitive information is masked with runs of X (e.g. XXXX, XX/XX/XXXX). 
3) Some complaint narratives simply state what the issue is with no obvious emotion; others contain strong emotion, mainly anger, towards the product or the customer service. This suggests a potential follow-up project on the sentiment of customer complaints. For example, a model could classify the sentiment (anger level) of a complaint to determine the urgency of the issue the customer raised. 

In [7]:
# (rows, columns) of the raw dataset.
raw_data.shape

(2326246, 18)

In [8]:
# Keep only the label column and the free-text narrative.
# The explicit copy keeps the original data from being modified by later
# steps; if memory is an issue, the copy can be skipped.
columns_of_interest = ['Product', 'Consumer complaint narrative']
complaints_data = raw_data[columns_of_interest].copy()

In [9]:
# Count missing values per column: how many complaints have no narrative?
complaints_data.isna().sum()

Product                               0
Consumer complaint narrative    1516903
dtype: int64

In [10]:
# Remove the complaints that have no narrative (rows containing a missing value),
# then show the remaining (rows, columns).
complaints_data = complaints_data.dropna()
complaints_data.shape

(809343, 2)

In [11]:
# Switch to short, lower-case column names for the rest of the notebook.
short_names = {'Product': 'product', 'Consumer complaint narrative': 'complaint'}
complaints_data = complaints_data.rename(columns=short_names)

In [12]:
# Show the first row to check the current column names.
complaints_data.head(1)

Unnamed: 0,product,complaint
1,Vehicle loan or lease,I contacted Ally on Friday XX/XX/XXXX after fa...


### How many products are we classifying?

In [13]:
# Number of distinct product labels.
complaints_data['product'].nunique()

18

In [14]:
# List the distinct product labels themselves.
complaints_data['product'].unique()

array(['Vehicle loan or lease',
       'Credit reporting, credit repair services, or other personal consumer reports',
       'Credit card or prepaid card',
       'Money transfer, virtual currency, or money service', 'Mortgage',
       'Payday loan, title loan, or personal loan', 'Debt collection',
       'Checking or savings account', 'Student loan', 'Consumer Loan',
       'Money transfers', 'Credit card', 'Bank account or service',
       'Credit reporting', 'Prepaid card', 'Payday loan',
       'Other financial service', 'Virtual currency'], dtype=object)

In [15]:
# Share (in percent) and absolute count of every product label, side by side.
# The counts are computed once and reused for both columns.
raw_label_counts = complaints_data['product'].value_counts()
raw_label_share = raw_label_counts / complaints_data.shape[0] * 100
pd.concat([raw_label_share, raw_label_counts], axis=1)

Unnamed: 0,product,product.1
"Credit reporting, credit repair services, or other personal consumer reports",39.101469,316465
Debt collection,19.445526,157381
Mortgage,10.435749,84461
Credit card or prepaid card,7.506582,60754
Checking or savings account,4.619673,37389
Credit reporting,3.902919,31588
Student loan,3.594768,29094
Credit card,2.327567,18838
"Money transfer, virtual currency, or money service",2.29606,18583
Vehicle loan or lease,1.857556,15034


We have 18 categories to classify, some of which are somewhat similar, such as payday loans, student loans and consumer loans. We can combine these similar categories into one category to reduce the number of target classes and to mitigate the class imbalance present in this dataset. 

In [16]:
# Collapse the 18 raw product labels into 9 broader classes.
# Keys are the raw labels exactly as they appear in the data (case-sensitive);
# values are the merged class names used from here on.
product_map = {
    'Vehicle loan or lease': 'vehicle_loan',
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
    'Credit reporting': 'credit_report',
    'Credit card or prepaid card': 'card',
    'Credit card': 'card',
    'Prepaid card': 'card',
    'Money transfer, virtual currency, or money service': 'money_transfer',
    'Money transfers': 'money_transfer',
    'Virtual currency': 'money_transfer',
    'Mortgage': 'mortgage',
    'Payday loan, title loan, or personal loan': 'loan',
    'Payday loan': 'loan',
    'Student loan': 'loan',
    'Consumer Loan': 'loan',
    'Debt collection': 'debt_collection',
    'Checking or savings account': 'savings_account',
    'Bank account or service': 'savings_account',
    'Other financial service': 'others',
}

# Only the label column is rewritten; labels missing from the map are left as-is.
complaints_data['product'] = complaints_data['product'].replace(product_map)

# Fail loudly if a raw label slipped through unmapped (e.g. a new or misspelled
# category) instead of silently carrying an extra class into modeling.
unmapped_labels = set(complaints_data['product']) - set(product_map.values())
assert not unmapped_labels, 'Unmapped product labels: {}'.format(sorted(unmapped_labels))

In [17]:
# Distinct product labels after the mapping step, and how many there are.
complaints_data['product'].unique(), complaints_data['product'].nunique()

(array(['vehicle_loan', 'credit_report', 'card', 'money_transfer',
        'mortgage', 'loan', 'debt_collection', 'savings_account', 'others'],
       dtype=object),
 9)

In [18]:
# Share (in percent) and absolute count per class, recomputed on the current labels.
# The counts are computed once and reused for both columns.
merged_label_counts = complaints_data['product'].value_counts()
merged_label_share = merged_label_counts / complaints_data.shape[0] * 100
pd.concat([merged_label_share, merged_label_counts], axis=1)

Unnamed: 0,product,product.1
credit_report,43.004388,348053
debt_collection,19.445526,157381
mortgage,10.435749,84461
card,10.013307,81042
savings_account,6.458819,52274
loan,6.265576,50710
money_transfer,2.483002,20096
vehicle_loan,1.857556,15034
others,0.036079,292


When modeling an imbalanced dataset in a classification problem, we want to pay close attention to the precision and recall scores. While correctly classifying the majority classes is our focus, we could conduct a cost analysis to investigate, for example, the average cost of the complaint types labeled as "others", to get an idea of whether correctly predicting this class is important. If it is, some additional tuning will be necessary.  

### Time to transform the data into the proper format

In [19]:
# converting to lower case
complaints_data['complaint'] = complaints_data['complaint'].apply(lambda x: x.lower())

Note: tokenizing the complaints takes a long time (about 15 minutes), so it is good practice to save the result and reload it the next time it is needed. 

In [27]:
tokens_input_path = '../output/'
tokens_file_name = 'tokens.pkl'

# Load the cached tokens when they exist; otherwise build and cache them so the
# notebook still runs from a fresh kernel with an empty output folder.
# NOTE: unpickling can execute arbitrary code - only load a cache you produced yourself.
try:
    tokens_ = load_pickle(tokens_input_path+tokens_file_name)
except FileNotFoundError:
    # NOTE(review): assumes the cache was originally built with NLTK's
    # word_tokenize on the lower-cased narratives - confirm against the
    # original tokenization step. Requires the NLTK tokenizer models to be installed.
    tokens_ = [word_tokenize(text) for text in tqdm(complaints_data['complaint'])]
    # save_pickle takes the directory and the base name (it appends '.pkl' itself).
    save_pickle(tokens_input_path.rstrip('/'), tokens_file_name[:-len('.pkl')], tokens_)

In [28]:
# Inspect the first entry of the loaded token lists.
tokens_[0]

['i',
 'contacted',
 'ally',
 'on',
 'friday',
 'xx/xx/xxxx',
 'after',
 'falling',
 'behind',
 'on',
 'payments',
 'due',
 'to',
 'being',
 'out',
 'of',
 'work',
 'for',
 'a',
 'short',
 'period',
 'of',
 'time',
 'due',
 'to',
 'an',
 'illness',
 '.',
 'i',
 'chated',
 'with',
 'a',
 'representative',
 'after',
 'logging',
 'into',
 'my',
 'account',
 'regarding',
 'my',
 'opitions',
 'to',
 'ensure',
 'i',
 'protect',
 'my',
 'credit',
 'and',
 'bring',
 'my',
 'account',
 'current',
 '.',
 'she',
 'advised',
 'me',
 'that',
 'before',
 'an',
 'extenstion',
 'could',
 'be',
 'done',
 ',',
 'i',
 'had',
 'to',
 'make',
 'a',
 'payment',
 'in',
 'the',
 'amount',
 'of',
 '{',
 '$',
 '270.00',
 '}',
 '.',
 'i',
 'reviewed',
 'my',
 'finances',
 ',',
 'as',
 'i',
 'am',
 'playing',
 'catch',
 'up',
 'on',
 'all',
 'my',
 'bills',
 'and',
 'made',
 'this',
 'payment',
 'on',
 'monday',
 'xx/xx/xxxx',
 '.',
 'this',
 'rep',
 'advised',
 'me',
 ',',
 'once',
 'this',
 'payment',
 'posts',

In [36]:
# `tokens` is not assigned by any earlier cell, so on a fresh kernel this cell
# would raise NameError. Rebuild it here from `tokens_`.
# NOTE(review): assumes the missing step removed NLTK's English stop words and
# nothing else (punctuation is kept) - confirm against the original cell.
# Requires the NLTK 'stopwords' corpus to be installed.
stop_words = set(stopwords.words('english'))
tokens = [[token for token in doc if token not in stop_words] for doc in tqdm(tokens_)]
tokens[0]

['contacted',
 'ally',
 'friday',
 'xx/xx/xxxx',
 'falling',
 'behind',
 'payments',
 'due',
 'work',
 'short',
 'period',
 'time',
 'due',
 'illness',
 '.',
 'chated',
 'representative',
 'logging',
 'account',
 'regarding',
 'opitions',
 'ensure',
 'protect',
 'credit',
 'bring',
 'account',
 'current',
 '.',
 'advised',
 'extenstion',
 'could',
 'done',
 ',',
 'make',
 'payment',
 'amount',
 '{',
 '$',
 '270.00',
 '}',
 '.',
 'reviewed',
 'finances',
 ',',
 'playing',
 'catch',
 'bills',
 'made',
 'payment',
 'monday',
 'xx/xx/xxxx',
 '.',
 'rep',
 'advised',
 ',',
 'payment',
 'posts',
 'account',
 'contact',
 'ally',
 'back',
 'extention',
 'payment',
 'deffered',
 'end',
 'loan',
 '.',
 'mind',
 ',',
 'contacted',
 'ally',
 'today',
 'chatted',
 'xxxx',
 '.',
 'explained',
 'information',
 'provided',
 'chatted',
 'rep',
 'last',
 'week',
 '.',
 'asked',
 'several',
 'questions',
 'advised',
 'one',
 'two',
 'month',
 'extension/deffered',
 'payment',
 'could',
 'done',
 'however