In [None]:
# 1. Import the libraries used throughout this notebook
import pathlib
import zipfile
import wget
import os
import requests
import pandas

In [None]:
%%capture
!pip install wget
!pip install os
!pip install zipfile
!pip install pathlib

In [None]:
# 2. Location of the download. The CFPB Consumer Complaint Database is used
#    to walk through the process; see
#    https://www.consumerfinance.gov/data-research/consumer-complaints/
URL = 'https://files.consumerfinance.gov/ccdb/complaints.csv.zip'

In [None]:
# 3. Download the data behind the URL.
# The timeout bounds how long the request may wait on a stalled connection
# (without it, requests waits indefinitely). raise_for_status() turns an HTTP
# error response into an exception, so an error page is never mistaken for
# the zip archive by the cells that follow.
response = requests.get(URL, timeout=60)
response.raise_for_status()

In [None]:
# 4. Write the response body to a new file called complaints.csv.zip.
# The with-block closes the file handle as soon as the write finishes, so the
# archive is fully flushed to disk before it is opened again for extraction.
with open("complaints.csv.zip", "wb") as zip_file:
    zip_file.write(response.content)

In [None]:
# 5. Inspect the working directory: remember its path and its current
#    contents, then report the types and values of both variables.
directory = os.getcwd()
arr = os.listdir(path=directory)

# types first ...
print("The variable, arr is of type:", type(arr))
print("The variable, directory is of type:", type(directory))
# ... then the values themselves, one per line
print(
    "Directory '% s' created" % (directory,),
    "List '% s' created" % (arr,),
    sep="\n",
)

In [None]:
# 6. Extract the downloaded archive into the working directory, then list
#    the directory again so the extracted file can be seen.
filepath = "/".join((directory, "complaints.csv.zip"))
with zipfile.ZipFile(filepath) as archive:  # mode defaults to 'r'
    archive.extractall(path=directory)

# refresh the directory listing and collect the CSV files now present
arr = os.listdir(directory)
csv_files = [csv_path for csv_path in pathlib.Path(directory).glob('*.csv')]

print(
    "Directory '% s' created" % (directory,),
    "List '% s' created" % (arr,),
    "Csv files '% s' created" % (csv_files,),
    sep="\n",
)



In [None]:
# 7. Load the complaints and keep a 2% random sample of the rows to reduce
#    model training time and allow quick analysis. "frac" is the fraction of
#    rows that DataFrame.sample() returns.
filepath = directory + "/complaints.csv"
# on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (where it raises a TypeError).
complaint_df = pandas.read_csv(filepath, on_bad_lines='skip')
# A fixed seed makes the sample reproducible; without it every run draws
# different rows, so the seeded train/test split further down would differ too.
complaint_df = complaint_df.sample(frac=0.02, random_state=6)

In [None]:
# 8. Count the complaints per product group, most frequent first. These
#    groups are the classes the classifier should predict from a given
#    complaint text.
complaint_df['Product'].value_counts(sort=True, ascending=False)

In [None]:
# 9. Keep only the product categories with a relevant number of samples and
#    drop every other category from further analysis. Many classification
#    algorithms work best when the training samples are split evenly across
#    the classes; with unbalanced data they may favor the classes with many
#    samples to achieve a good overall result.
train_test_df = complaint_df[
    complaint_df['Product'].isin([
        'Credit reporting, credit repair services, or other personal consumer reports',
        'Debt collection',
        'Mortgage',
        'Credit card or prepaid card',
        'Checking or savings account',
    ])
]

In [None]:
# 10. Show the first 5 rows of the filtered data (head() defaults to n=5)
train_test_df.head()

In [None]:
# 11. Split the data into training and test data (ratio: 80/20).
# Training part: 80% of every product group, drawn with a fixed seed.
train_orig_df = train_test_df.groupby('Product').sample(frac=0.8, random_state=6)
# Test part: the remaining 20%, i.e. every row whose index label was not
# drawn for training. This must happen before the indexes are reset below.
test_orig_df = train_test_df.drop(index=train_orig_df.index)

print(
    "Training data:",
    "Number of training samples: {}".format(len(train_orig_df)),
    "Samples by product group:\n{}".format(train_orig_df['Product'].value_counts()),
    sep="\n",
)

print(
    "\nTest data:",
    "Number of test samples: {}".format(len(test_orig_df)),
    "Samples by product group:\n{}".format(test_orig_df['Product'].value_counts()),
    sep="\n",
)

# give both parts a fresh 0..n-1 index now that sampling is done
train_orig_df, test_orig_df = (
    part.reset_index(drop=True) for part in (train_orig_df, test_orig_df)
)

In [None]:
# 12. Bring the data into the record layout that is written to JSON files.
def prepare_data(df):
    """Reduce a complaints frame to the two columns used for training.

    Selects 'Consumer complaint narrative' and 'Product', renames them to
    'text' and 'labels' (the identifiers expected by Watson NLP), and wraps
    each label in a one-element list, because the label column should be an
    array even though there is only one label per complaint.

    The input frame is not modified; a new frame with a fresh 0..n-1 index
    is returned. Rows are neither filtered nor reordered.
    """
    return (
        df[['Consumer complaint narrative', 'Product']]
        .reset_index(drop=True)
        .rename(columns={"Consumer complaint narrative": "text", 'Product': 'labels'})
        .assign(labels=lambda frame: frame['labels'].map(lambda label: [label]))
    )

# Convert both splits and write each one to a JSON file as a list of records.
train_df = prepare_data(train_orig_df)
train_file = './train_data.json'
train_df.to_json(train_file, orient='records')

test_df = prepare_data(test_orig_df)
# Named with an underscore to match train_data.json; the previous name,
# './test data.json', contained a space.
test_file = './test_data.json'
test_df.to_json(test_file, orient='records')

# List the JSON files that are now present in the working directory.
json_files = list(pathlib.Path(directory).glob('*.json'))
print("JSON files '% s' created" % json_files)

# Preview the first 10 training records.
train_df.head(10)

In [None]:
# 13. Show the labels: explode() unpacks each list in 'labels' into its own row
test_df.explode(column='labels')