This example reuses material from the tutorial [Text classification using Watson NLP](https://developer.ibm.com/tutorials/text-classification-using-watson-nlp/).

# 1. Prepare environment
## 1.1 Import needed libraries

In [None]:
# 1.1 Import the libraries used throughout this notebook

# OS
import os
import wget
import pathlib
import zipfile
import requests
# Layout
import plotly.express as px
import plotly.io as pio
# Data
import pandas
# Watson
import watson_nlp
from watson_nlp.workflows.classification import Ensemble
from watson_core.data_model.streams.resolver import DataStreamResolver
from watson_nlp.blocks.classification.svm import SVM

In [None]:
%%capture
!pip install wget
!pip install os
!pip install zipfile
!pip install pathlib

## 1.2. Download example data and transform to `csv` format

In [None]:
# 1.2.1. Set the download URL. This walkthrough uses the Consumer Complaint Database
# (https://www.consumerfinance.gov/data-research/consumer-complaints/), published as a zipped CSV file.
URL = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"

In [None]:
# 1.2.2. Download the ZIP archive behind the URL.
# The timeout keeps the cell from hanging indefinitely when the server stops
# responding (it limits the wait for the connection and between received bytes,
# not the total download time).
response = requests.get(URL, timeout=60)
# Fail here on an HTTP error instead of letting the next cell save an error
# page under the name "complaints.csv.zip".
response.raise_for_status()

In [None]:
# 1.2.3. Save the downloaded bytes to a new file called complaints.csv.zip.
# The `with` block closes the file handle (and flushes the data to disk)
# before the archive is opened again for unzipping.
with open("complaints.csv.zip", "wb") as zip_file:
    zip_file.write(response.content)

In [None]:
# 1.2.4. Inspect the working directory: remember its path in `directory`,
# list its entries in `arr`, and print the types and values of both.
directory = os.getcwd()
arr = os.listdir(directory)

# Report the Python type of each variable first ...
for message, variable in (
    ("The variable, arr is of type:", arr),
    ("The variable, directory is of type:", directory),
):
    print(message, type(variable))

# ... then the values themselves.
print("Directory '% s' created" % (directory,))
print("List '% s' created" % (arr,))

In [None]:
# 1.2.5. Extract the downloaded archive into the working directory and
# print the directory content so the extracted CSV file can be verified.
filepath = directory + "/complaints.csv.zip"
with zipfile.ZipFile(filepath, mode='r') as archive:
    archive.extractall(path=directory)

# Refresh the directory listing and collect the CSV files found in it.
arr = os.listdir(directory)
working_dir = pathlib.Path(directory)
csv_files = list(working_dir.glob('*.csv'))

for template, value in (
    ("Directory '% s' created", directory),
    ("List '% s' created", arr),
    ("Csv files '% s' created", csv_files),
):
    # Wrap the value in a 1-tuple so lists are formatted as a single argument.
    print(template % (value,))



## 1.3. Optimize the example data

In [None]:
# 1.3.1 Load the complaints and keep a random 2% sample (`frac=0.02` is the
# fraction of rows to keep) to reduce model training time and speed up analysis.
import inspect

filepath = directory + "/complaints.csv"
# Malformed lines are skipped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0 in
# favour of `on_bad_lines`, so use whichever keyword this pandas version accepts.
if "on_bad_lines" in inspect.signature(pandas.read_csv).parameters:
    complaint_df = pandas.read_csv(filepath, on_bad_lines="warn")
else:
    complaint_df = pandas.read_csv(filepath, error_bad_lines=False)
# A fixed seed makes the sample repeatable; without it the seeded train/test
# split further down would still operate on different rows on every run.
complaint_df = complaint_df.sample(frac=0.02, random_state=6)

In [None]:
# 1.3.2 Count the complaints per product group. These product groups are the
# classes the classifier should predict from a given complaint text.
product_counts = complaint_df['Product'].value_counts()
product_counts

In [None]:
# 1.3.3 Keep only the product categories with a relevant number of samples and
# drop every other category from further analysis. Many classification
# algorithms work best when the classes are of similar size; on unbalanced data
# they might favor the classes with many samples to achieve a good overall result.
relevant_products = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Debt collection',
    'Mortgage',
    'Credit card or prepaid card',
    'Checking or savings account',
]
# A row is kept when its product equals any of the listed categories.
train_test_df = complaint_df[complaint_df['Product'].isin(relevant_products)]

In [None]:
# 1.3.4 Preview the first 5 rows of the filtered data used for training and testing
train_test_df.head(n=5)

In [None]:
# 1.3.5 Split the data into training and test data (ratio: 80/20).
# Training data: 80% of every product group, drawn with a fixed seed.
train_orig_df = train_test_df.groupby('Product').sample(frac=0.8, random_state=6)
# Test data: every row whose index label was not drawn for training.
test_orig_df = train_test_df.drop(train_orig_df.index)

# Replace the index labels left over from sampling with a fresh 0..n-1 index.
# This has to happen after the drop above, which relies on the old labels.
train_orig_df = train_orig_df.reset_index(drop=True)
test_orig_df = test_orig_df.reset_index(drop=True)

# Report size and class distribution of the training set ...
print("Training data:\n")
print("Number of training samples:\n{}".format(train_orig_df.shape[0]))
print("Samples by product group:\n{}".format(train_orig_df['Product'].value_counts()))

# ... and of the test set.
print("\nTest data:\n")
print("Number of test samples:\n{}".format(test_orig_df.shape[0]))
print("Samples by product group:\n{}".format(test_orig_df['Product'].value_counts()))

In [None]:
# 1.3.6 Create the data in JSON format. The training and test data are written to files.
def prepare_data(df):
    """Return a two-column frame (`text`, `labels`) built from complaint data.

    Only the 'Consumer complaint narrative' and 'Product' columns of *df* are
    kept. They are renamed to 'text' and 'labels' (the identifiers expected by
    Watson NLP), the index is reset to 0..n-1, and every label is wrapped in a
    one-element list because the label column should be an array even though
    each complaint carries a single label. *df* itself is not modified.
    """
    # Source column -> output column; the key order fixes the column order.
    column_map = {"Consumer complaint narrative": "text", "Product": "labels"}
    selected = df[list(column_map)]
    df_out = selected.reset_index(drop=True).rename(columns=column_map)
    df_out["labels"] = df_out["labels"].map(lambda product: [product])
    return df_out

# Convert the training split to the text/labels layout, drop complaints that
# have no narrative text (NaN), and write the result as JSON records.
train_df = prepare_data(train_orig_df)
train_df.dropna(subset=['text'], how='all', inplace=True)
train_file = directory + "/train_data.json"
train_df.to_json(train_file, orient='records')

# Same treatment for the test split. The file name uses an underscore, like
# train_data.json, so the path contains no space.
test_df = prepare_data(test_orig_df)
test_df.dropna(subset=['text'], how='all', inplace=True)
test_file = directory + "/test_data.json"
test_df.to_json(test_file, orient='records')

# List every JSON file now present in the working directory.
json_files = list(pathlib.Path(directory).glob('*.json'))
print("JSON files '% s' created" % json_files)

# Display the first 10 prepared training records.
train_df.head(10)

In [None]:
# 1.3.7 Show the training data with the one-element label lists expanded into plain label values
exploded_train_df = train_df.explode('labels')
exploded_train_df

In [None]:
# 1.3.8 Show the test data with the one-element label lists expanded into plain label values
exploded_test_df = test_df.explode('labels')
exploded_test_df

In [None]:
# 1.3.9 Show the label distribution of the test data as a bar chart
# Use the built-in dark template and register it under a second, custom name as well.
plotly_template = pio.templates["plotly_dark"]
pio.templates["plotly_dark_custom"] = plotly_template

# Number of test samples per label (each label list is expanded to its value first).
test_label_counts = test_df.explode('labels')['labels'].value_counts()

complaints_total_figure = px.bar(test_label_counts)
complaints_total_figure.update_layout(
    template=plotly_template,
    barmode='stack',
    title_text='Show test dataset',
    title_x=0.5,  # center the title horizontally
)
complaints_total_figure.show()

# 2. Build the model

In [None]:
# 2.1 Load the syntax model and the USE embeddings because the SVM classifier block depends on the syntax block.

# Syntax model: download the stock model, then load what the download returned
# (presumably a local path to the model; confirm against the watson_nlp docs).
syntax_model_location = watson_nlp.download('syntax_izumo_en_stock')
syntax_model = watson_nlp.load(syntax_model_location)

# USE embedding model: same two steps.
use_model_location = watson_nlp.download('embedding_use_en_stock')
use_model = watson_nlp.load(use_model_location)

In [None]:
# 2.2 Create data streams using several utility methods because classification blocks expect the training data to be in data streams.

training_data_file = train_file
# print() does not interpolate extra arguments the way logging calls do, so the
# path is formatted into the message explicitly.
print("Training data file %s" % training_data_file)

# Create a data stream from the training data file; each record is expected to
# provide a 'text' string and a 'labels' list.
data_stream_resolver = DataStreamResolver(target_stream_type=list, expected_keys={'text': str, 'labels': list})
training_data = data_stream_resolver.as_data_stream(training_data_file)

# Separate the texts from the labels and run the syntax model over the texts.
text_stream, labels_stream = training_data[0], training_data[1]
syntax_stream = syntax_model.stream(text_stream)

# Run the USE model over the syntax results and pair its output with the labels;
# the zipped stream is the training input for the SVM classifier.
use_train_stream = use_model.stream(syntax_stream, doc_embed_style='raw_text')
use_svm_train_stream = watson_nlp.data_model.DataStream.zip(use_train_stream, labels_stream)

In [None]:
# 2.3 Train the SVM classifier on the zipped stream built in step 2.2
# (output of the USE model paired with the label stream).
# This can take several minutes!
svm_model = SVM.train(use_svm_train_stream)

## 2.4 Train an ensemble classification model with Watson NLP

The ensemble model combines three classification models:

* CNN
* SVM with TF-IDF features
* SVM with USE (Universal Sentence Encoder) features

In [None]:
# Stop-word resource passed to the ensemble training below.
stopwords = watson_nlp.download_and_load('text_stopwords_classification_ensemble_en_stock')

# Train the ensemble directly from the training JSON file. The three names
# identify the syntax, GloVe embedding and USE embedding stock models, passed
# positionally in that order.
ensemble_model = Ensemble.train(
    train_file,
    'syntax_izumo_en_stock',
    'embedding_glove_en_stock',
    'embedding_use_en_stock',
    stopwords=stopwords,
)