## Consumer Complaints Dataset
### 1. Preprocessing
The categories (the 'Product' field) of the complaints often overlap or are subsets of one another. For example, complaints filed under the product 'Credit card' could also be classified under 'Credit card or prepaid card'. We merge categories of this type, which yields fewer Products but more Sub-products. Once the initial pass is done, we would like to predict Sub-products.

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import re
import numpy as np
import pandas as pd

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import os
import sys

# Make the parent of the current working directory importable, so modules
# living one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [41]:
def gen_model_data(df, subset=None, features=None):
    """
    Drop incomplete rows and optionally restrict the frame to some columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    subset : list-like of column labels, optional
        Rows with a NaN in any of these columns are dropped. With the default
        ``None``, rows with a NaN in *any* column are dropped
        (``DataFrame.dropna`` semantics).
    features : list-like of column labels, optional
        Columns to keep, in the given order. ``None`` or an empty list-like
        keeps every column.

    Returns
    -------
    pandas.DataFrame
        The filtered (and possibly column-restricted) data.
    """
    df = df.dropna(subset=subset)
    # Explicit None/length check instead of truthiness: `if features:` raises
    # "truth value is ambiguous" for a pandas Index or numpy array.
    if features is not None and len(features) > 0:
        df = df[features]
    return df

In [42]:
# Path (relative to the notebook's working directory) of the complaints CSV.
complaints_file = '../data/Consumer_Complaints.csv'

In [43]:
# low_memory=False parses the file in a single pass rather than in chunks,
# which avoids mixed-type inference within a column.
df = pd.read_csv(complaints_file, low_memory=False)

In [45]:
# Keep only complaints that have a product, an issue and a narrative:
# rows with a NaN in any of these columns are dropped.
subset = ['Product', 'Issue', 'Consumer complaint narrative']
df = gen_model_data(df, subset=subset)

### Merge overlapping product categories

In [48]:
# Save the rows that survived the NaN filter; index=False omits the row index.
df.to_csv('../data/with_narrative.csv', index=False)

In [47]:
# Number of complaints per product category before merging.
df['Product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    119283
Debt collection                                                                  96750
Mortgage                                                                         57416
Credit reporting                                                                 31588
Credit card or prepaid card                                                      26803
Student loan                                                                     23557
Credit card                                                                      18838
Checking or savings account                                                      16136
Bank account or service                                                          14885
Consumer Loan                                                                     9473
Vehicle loan or lease                                                             6940
Money transfer, virtual currency, or money 

In [49]:
def _relabel_product(df, old_product, new_product, sub_product=None):
    """
    Return a copy of the rows filed under `old_product`, relabelled as `new_product`.

    When `sub_product` is given it overwrites the 'Sub-product' column of the
    returned rows; otherwise the existing sub-products are kept. The input
    frame is never modified.
    """
    # .copy() gives an independent frame, so the assignments below are
    # unambiguous and do not trigger SettingWithCopyWarning.
    rows = df[df['Product'] == old_product].copy()
    if sub_product is not None:
        rows['Sub-product'] = sub_product
    rows['Product'] = new_product
    return rows


def merge_products(df):
    """
    Collapse overlapping 'Product' categories into their broader category.

    Merges performed (narrow product -> broad product):

    * 'Payday loan' -> 'Payday loan, title loan, or personal loan'
      (Sub-product set to 'Payday loan')
    * 'Credit card' -> 'Credit card or prepaid card'
      (Sub-product set to 'Credit card'); 'Prepaid card' is merged into the
      same product with its Sub-product left as is
    * 'Vehicle loan or lease' -> 'Consumer Loan'; the sub-products 'Loan' and
      'Lease' are renamed to 'Vehicle loan' and 'Vehicle lease'
    * 'Virtual currency', 'Money transfers' ->
      'Money transfer, virtual currency, or money service'
    * 'Credit reporting' -> 'Credit reporting, credit repair services, or
      other personal consumer reports' (Sub-product set to 'Credit reporting')
    * 'Checking or savings account' -> 'Bank account or service'

    'Debt collection', 'Mortgage', 'Student loan' and 'Other financial
    service' are passed through unchanged. Rows with any other product are
    not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Complaints with 'Product' and 'Sub-product' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the merged labels. Rows keep their original index
        labels and are grouped by merged product.
    """
    # Merge Payday loan with other loans
    payday_type = 'Payday loan, title loan, or personal loan'
    loans = pd.concat([
        _relabel_product(df, 'Payday loan', payday_type, sub_product='Payday loan'),
        _relabel_product(df, payday_type, payday_type),
    ])

    # Credit card / prepaid card
    card_type = 'Credit card or prepaid card'
    credit = pd.concat([
        _relabel_product(df, 'Prepaid card', card_type),
        _relabel_product(df, 'Credit card', card_type, sub_product='Credit card'),
        _relabel_product(df, card_type, card_type),
    ])

    # Merge vehicle loans with consumer loans, renaming the generic
    # sub-products so they stay distinguishable inside the broader product.
    consumer_type = 'Consumer Loan'
    sub_product_renames = [('Loan', 'Vehicle loan'),
                           ('Lease', 'Vehicle lease'),
                           ('Title loan', 'Title loan')]
    vehicles = df[df['Product'] == 'Vehicle loan or lease']
    vehicle_parts = []
    for old, new in sub_product_renames:
        part = vehicles[vehicles['Sub-product'] == old].copy()
        part['Sub-product'] = new
        vehicle_parts.append(part)
    # Vehicle rows with any other (or missing) sub-product are kept as they
    # are instead of being silently dropped from the data set.
    known_sub_products = [old for old, _ in sub_product_renames]
    vehicle_parts.append(vehicles[~vehicles['Sub-product'].isin(known_sub_products)])
    vehicles = pd.concat(vehicle_parts)
    consumer_loans = pd.concat([vehicles, df[df['Product'] == consumer_type]])
    consumer_loans['Product'] = consumer_type

    # Merge virtual currency and money transfers
    transfer_type = 'Money transfer, virtual currency, or money service'
    transfer = pd.concat([
        _relabel_product(df, 'Virtual currency', transfer_type),
        _relabel_product(df, 'Money transfers', transfer_type),
        _relabel_product(df, transfer_type, transfer_type),
    ])

    # Credit reporting
    reporting_type = 'Credit reporting, credit repair services, or other personal consumer reports'
    reporting = pd.concat([
        _relabel_product(df, 'Credit reporting', reporting_type, sub_product='Credit reporting'),
        _relabel_product(df, reporting_type, reporting_type),
    ])

    # Banking service
    banking_type = 'Bank account or service'
    banking = pd.concat([
        _relabel_product(df, banking_type, banking_type),
        _relabel_product(df, 'Checking or savings account', banking_type),
    ])

    not_modified = df[df['Product'].isin(['Debt collection', 'Mortgage', 'Student loan', 'Other financial service'])]
    return pd.concat([not_modified, loans, credit, consumer_loans, transfer, reporting, banking])

In [50]:
# Merge the overlapping product categories with merge_products (defined in
# the previous cell).
merged = merge_products(df)

In [51]:
# Product categories that remain after merging, most frequent first.
merged['Product'].value_counts().index

Index(['Credit reporting, credit repair services, or other personal consumer reports',
       'Debt collection', 'Mortgage', 'Credit card or prepaid card',
       'Bank account or service', 'Student loan', 'Consumer Loan',
       'Money transfer, virtual currency, or money service',
       'Payday loan, title loan, or personal loan', 'Other financial service'],
      dtype='object')

In [52]:
# Two-letter code for each merged product category.
abbrev_map = {
    'Credit reporting, credit repair services, or other personal consumer reports': 'CR',
    'Debt collection': 'DC',
    'Mortgage': 'MO',
    'Credit card or prepaid card': 'CC',
    'Bank account or service': 'BS',
    'Student loan': 'SL',
    'Consumer Loan': 'CL',
    'Money transfer, virtual currency, or money service': 'MT',
    'Payday loan, title loan, or personal loan': 'PL',
    'Other financial service': 'OT',
}

In [53]:
def map_abbrev(val):
    """
    Look up the abbreviation for a product name in `abbrev_map`.

    Raises KeyError when `val` has no entry in the mapping.
    """
    abbreviation = abbrev_map[val]
    return abbreviation

In [54]:
# Add an 'Abbrev' column with the short code of each row's product.
# map_abbrev raises KeyError for a product that has no entry in abbrev_map.
merged['Abbrev'] = merged['Product'].apply(map_abbrev)

In [55]:
# Number of complaints per product category after merging.
merged['Product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    150871
Debt collection                                                                  96750
Mortgage                                                                         57416
Credit card or prepaid card                                                      47091
Bank account or service                                                          31021
Student loan                                                                     23557
Consumer Loan                                                                    16413
Money transfer, virtual currency, or money service                                8136
Payday loan, title loan, or personal loan                                         7113
Other financial service                                                            292
Name: Product, dtype: int64

In [56]:
# Save the merged data (including the 'Abbrev' column); index=False omits the row index.
merged.to_csv('../data/product_merged.csv', index=False)