In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [14]:
import warnings
# Install a global filter that silences every warning category from every
# module until the filters are changed again.
# NOTE(review): a blanket 'ignore' also hides pandas warnings such as
# SettingWithCopyWarning, which column assignments on filtered frames can
# raise -- consider narrowing this to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [6]:
import os
import sys
# Put the parent of the current working directory on the import path (once),
# so that packages living next to the notebook folder can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [7]:
from preprocess.clean_text import TextClean

In [1]:
def gen_model_data(df, subset=None, features=None, lowercase=True, min_sentence_length=5):
    """
    Drop incomplete rows and optionally keep only selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a new object is returned.
    subset : list of str, optional
        Columns that must be non-null for a row to be kept. With ``None``
        (the default) a row is dropped if any of its columns is NaN.
    features : list-like of str, optional
        Columns to keep in the result. ``None`` or an empty collection
        keeps every column. A single column name (a plain string) is passed
        straight to ``df[...]`` and therefore yields a Series.
    lowercase : bool, optional
        Currently unused; kept so existing callers keep working.
    min_sentence_length : int, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The rows that survived ``dropna``, restricted to ``features`` when
        a non-empty selection was given.
    """
    df = df.dropna(subset=subset)

    # Decide "was a column selection given?" with None/len() rather than
    # truthiness: bool() of a pandas Index (or a multi-element numpy array)
    # raises ValueError, so a plain ``if features:`` rejected those inputs.
    try:
        keep_columns = features is not None and len(features) > 0
    except TypeError:
        # Unsized selectors keep the original truthiness semantics.
        keep_columns = bool(features)

    if keep_columns:
        df = df[features]
    return df

In [8]:
# Location of the complaints CSV that is loaded into `df`. The path is
# relative, so it resolves against the kernel's current working directory.
complaints_file = '../data/Consumer_Complaints.csv'

In [9]:
# low_memory=False makes pandas parse the file in one pass instead of in
# internal chunks, so each column's dtype is inferred from all of its values
# (avoids mixed-type columns at the cost of more memory while parsing).
df = pd.read_csv(complaints_file, low_memory=False)

In [67]:
# Columns that must be populated for a complaint to be kept.
# Deliberately not called `subset`: later cells bind that name to DataFrames,
# and reusing it for a list of column names makes the notebook state harder
# to reason about.
required_columns = [
    'Product',
    'Issue',
    'Date sent to company',
    'Consumer complaint narrative',
]
# Rows with a NaN in any of the required columns are dropped; `df` is rebound
# to the filtered frame, which is what every later cell works on.
df = gen_model_data(df, subset=required_columns)

### Merge overlapping product categories
- Payday loans are merged with the rest of the loans ('Payday loan, title loan, or personal loan')

In [68]:
# Move 'Payday loan' complaints under the combined
# 'Payday loan, title loan, or personal loan' product. Their Sub-product is
# overwritten with the old product name so the distinction is not lost.
main_type = 'Payday loan, title loan, or personal loan'
subset_type = 'Payday loan'

# df[mask] returns a filtered frame; writing columns into it with
# subset[...] = ... is the pattern pandas reports as SettingWithCopyWarning.
# .assign() builds a new frame instead and leaves df untouched.
subset = df[df.Product == subset_type].assign(
    **{'Sub-product': subset_type, 'Product': main_type}
)
main_set = df[df.Product == main_type]

# Relabelled payday rows first, then the rows already under the combined product.
loans = pd.concat([subset, main_set])

In [69]:
# Collect 'Credit card' and 'Prepaid card' complaints under the combined
# 'Credit card or prepaid card' product.
main_type = 'Credit card or prepaid card'
subset_type = 'Credit card'

# 'Credit card' rows get the old product name as their Sub-product.
# .assign() returns a new frame, which avoids writing into a filtered slice
# of df (the pattern pandas reports as SettingWithCopyWarning).
subset = df[df.Product == subset_type].assign(
    **{'Sub-product': subset_type, 'Product': main_type}
)
main_set = df[df.Product == main_type]
credit = pd.concat([subset, main_set])

# 'Prepaid card' rows are prepended as-is: only their Product is relabelled
# below, their existing Sub-product values are kept.
subset = df[df.Product == 'Prepaid card']
credit = pd.concat([subset, credit])
credit['Product'] = main_type

In [70]:
# Build the 'Consumer Loan' product from the rows already labelled that way
# plus the 'Vehicle loan or lease' rows, whose Sub-products are renamed.
main_type = 'Consumer Loan'
subset_type = 'Vehicle loan or lease'
vehicles = df[df.Product == subset_type]

# (old Sub-product, new Sub-product) pairs. Only these three Sub-products are
# carried over: vehicle rows with any other (or missing) Sub-product do not
# appear in the result.
sub_product_renames = [('Loan', 'Vehicle loan'), ('Lease', 'Vehicle lease'), ('Title loan', 'Title loan')]

# One relabelled frame per pair, kept in the order listed above.
# .assign() returns a new frame, which avoids writing into a filtered slice
# of `vehicles` (the pattern pandas reports as SettingWithCopyWarning).
loan_types = [
    vehicles[vehicles['Sub-product'] == old].assign(**{'Sub-product': new})
    for old, new in sub_product_renames
]
vehicles = pd.concat(loan_types)

# Vehicle rows first, then the original 'Consumer Loan' rows; every row ends
# up with Product == main_type.
consumer_loans = df[df.Product == main_type]
consumer_loans = pd.concat([vehicles, consumer_loans])
consumer_loans['Product'] = main_type

In [78]:
# Gather 'Virtual currency' and 'Money transfers' complaints under the
# combined 'Money transfer, virtual currency, or money service' product.
main_type = 'Money transfer, virtual currency, or money service'
subset_type1 = 'Virtual currency'
subset_type2 = 'Money transfers'

# One filtered frame per product label, in the order they are stacked below.
subset1, subset2, transfer = (
    df[df.Product == product]
    for product in (subset_type1, subset_type2, main_type)
)

# Stack the three groups and relabel every row with the combined product;
# Sub-product values are left as they are.
transfer = pd.concat([subset1, subset2, transfer]).assign(Product=main_type)

In [79]:
# Move 'Credit reporting' complaints under the combined
# 'Credit reporting, credit repair services, or other personal consumer reports'
# product. Their Sub-product is overwritten with the old product name.
main_type = 'Credit reporting, credit repair services, or other personal consumer reports'
subset_type = 'Credit reporting'

# .assign() returns a new frame, which avoids writing columns into a filtered
# slice of df (the pattern pandas reports as SettingWithCopyWarning).
subset = df[df.Product == subset_type].assign(
    **{'Sub-product': subset_type, 'Product': main_type}
)
main_set = df[df.Product == main_type]

# Relabelled rows first, then the rows already under the combined product.
reporting = pd.concat([subset, main_set])

In [80]:
# Put 'Checking or savings account' complaints under the
# 'Bank account or service' product.
main_type = 'Bank account or service'
subset_type = 'Checking or savings account'

# Filter once per label: rows already under the target product, then the
# rows that are being folded into it.
main_set, subset = (
    df[df.Product == product] for product in (main_type, subset_type)
)

# Stack both groups and relabel every row; Sub-product values are untouched.
banking = pd.concat([main_set, subset]).assign(Product=main_type)

In [81]:
# Distinct Product labels present in df, displayed for reference.
all_types = df.Product.unique()
all_types

array(['Credit reporting, credit repair services, or other personal consumer reports',
       'Debt collection', 'Mortgage', 'Checking or savings account',
       'Credit card or prepaid card', 'Vehicle loan or lease',
       'Payday loan, title loan, or personal loan', 'Student loan',
       'Money transfer, virtual currency, or money service',
       'Credit reporting', 'Credit card', 'Bank account or service',
       'Consumer Loan', 'Money transfers', 'Prepaid card', 'Payday loan',
       'Other financial service', 'Virtual currency'], dtype=object)

In [82]:
# Products that are carried over unchanged (no relabelling was applied to them).
not_modified = df.loc[
    df['Product'].isin(['Debt collection', 'Mortgage', 'Student loan', 'Other financial service'])
]

# Final dataset: the untouched products followed by each regrouped frame.
# Row labels from df are kept as-is (no ignore_index).
merged = pd.concat(
    [
        not_modified,
        loans,
        credit,
        consumer_loans,
        transfer,
        reporting,
        banking,
    ]
)

In [83]:
# Number of rows in the regrouped frame.
merged.shape[0]

434037

In [85]:
# Persist the regrouped complaints. index=False leaves the row labels out of
# the file; `merged` still carries the labels of the frames it was
# concatenated from, so they are not a clean 0..n-1 range.
merged.to_csv('../data/regrouped.csv', index=False)