# Watson NLP custom model for text classification

This notebook is a simplified example that combines material from several resources.

## Related resources

This example reuses information from a tutorial called [Text classification using Watson NLP](https://developer.ibm.com/tutorials/text-classification-using-watson-nlp/) on IBM Developer. The tutorial points to the Jupyter notebook called [Classifying customer complaints using Watson NLP](https://github.com/ibm-build-lab/Watson-NLP/blob/main/ML/Text-Classification/Consumer%20complaints%20Classification.ipynb).

# 1. Prepare environment
## 1.1 Import needed libraries

In [None]:
# 1.1 Import the requests library

# OS
import requests
# Layout
import plotly.express as px
import plotly.io as pio
# Data
import json
import pandas as pd
# Watson
import watson_nlp
from watson_nlp.workflows.classification import Ensemble
from watson_core.data_model.streams.resolver import DataStreamResolver
from watson_nlp.blocks.classification.svm import SVM
# Confusion matrix in sklearn
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sn
# Project
from project_lib import Project

In [None]:
%%capture
!pip install wget
!pip install os
!pip install zipfile
!pip install pathlib
!pip install shutil

In [None]:
# OS
import os
import wget
import pathlib
import zipfile
import shutil

## 1.2. Download example data and transform to `csv` format

In [None]:
# 1.2.1 Inspect the working directory: print its path, its entries and the
# Python types of both variables.
directory = os.getcwd()
arr = os.listdir(directory)
print("The variable, arr is of type:", type(arr))
print("The variable, directory is of type:", type(directory))
print("Directory '% s' created" % directory)
print("List '% s' created" % arr)

# 1.2.2 Clean-up: remove every regular file found directly in the working
# directory (sub-directories are skipped here and handled in 1.2.3).
# NOTE(review): this deletes ALL files in the current directory, not only the
# ones created by this notebook - confirm that this is intended.
for entry_name in os.listdir(directory):
    entry_path = f"{directory}/{entry_name}"
    if not os.path.isfile(entry_path):
        continue
    print('Deleting file:', entry_path)
    os.remove(entry_path)

# 1.2.3 Remove the (possibly non-empty) model folders. `ignore_errors=True`
# means a missing folder is not an error, so the message is printed either way.
for model_folder in (
    "embedding_use_en_stock",
    "text_stopwords_classification_ensemble_en_stock",
    "syntax_izumo_en_stock",
    "embedding_glove_en_stock",
):
    dir_path = f"{directory}/{model_folder}"
    shutil.rmtree(dir_path, ignore_errors=True)
    print("Deleted '%s' directory successfully" % dir_path)

In [None]:
# 1.2.4 Choose the download source. The example uses the Consumer Complaint
# Database (https://www.consumerfinance.gov/data-research/consumer-complaints/).
# The original, zipped data set is available at:
# URL = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"
# To avoid data problems, you can use the csv copy available on "ibm.box.com".
# Later cells compare URL with the zip address above to decide whether the
# download has to be unzipped and which sampling fraction is used.
URL = "https://ibm.box.com/shared/static/fbs5buv3iix9bbckaw5ojsjf4ntkwvcz.csv" 

In [None]:
# 1.2.5 Download the data behind the URL.
# The timeout keeps the cell from hanging forever when the server does not
# answer; it limits the wait for the connection and for each chunk of data,
# not the total download time.
response = requests.get(URL, timeout=60)
# Fail right here on an HTTP error (4xx/5xx) instead of silently saving the
# error page as the data set in the next step.
response.raise_for_status()

In [None]:
# 1.2.6 Write the downloaded bytes to the working directory:
# "complaints.csv.zip" for the zipped original data set, "complaints.csv" otherwise.
if URL == "https://files.consumerfinance.gov/ccdb/complaints.csv.zip":
    download_name = "complaints.csv.zip"
else:
    download_name = "complaints.csv"

# The context manager closes the file, so the content is flushed to disk
# before the following cells read it.
with open(download_name, "wb") as download_file:
    download_file.write(response.content)

In [None]:
# 1.2.7 Unzip the download when the zipped source was used, then list the
# csv files that are now present in the working directory.
is_zip_download = URL == "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"

if is_zip_download:
    filepath = directory + "/complaints.csv.zip"
    with zipfile.ZipFile(filepath, 'r') as archive:
        archive.extractall(directory)
    arr = os.listdir(directory)

csv_files = list(pathlib.Path(directory).glob('*.csv'))

if is_zip_download:
    print("Directory '% s' created" % directory)
    print("List '% s' created" % arr)
print("Csv files '% s' created" % csv_files)


## 1.3. Optimize the example data

In [None]:
# 1.3.1 Load the complaints and keep only a random fraction ("frac") of the
# rows to reduce model training time and allow a quick analysis.
filepath = directory + "/complaints.csv"
try:
    # `on_bad_lines` replaces `error_bad_lines`, which was deprecated in
    # pandas 1.3 and removed in pandas 2.0. "warn" skips malformed lines and
    # reports them, like the former `error_bad_lines=False` default behaviour.
    complaint_df = pd.read_csv(filepath, on_bad_lines="warn")
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines` and rejects the keyword.
    complaint_df = pd.read_csv(filepath, error_bad_lines=False)

# The zipped original data set is sampled with a much smaller fraction than
# the csv copy.
if URL == "https://files.consumerfinance.gov/ccdb/complaints.csv.zip":
    value = 0.0005
else:
    value = 0.02
complaint_df = complaint_df.sample(frac=value)
print("Frac '% 1.4f' " % value)

In [None]:
# 1.3.2 Count the complaints per product group. These groups are the classes
# that the classifier should predict from a given complaint text.
complaint_df['Product'].value_counts()

In [None]:
# 1.3.3 Keep only the product categories with a relevant number of samples and
# drop all other categories from further analysis. Many classification
# algorithms work best if the training samples are split evenly across the
# classes; with unbalanced data they might favor the classes with many samples
# to achieve an overall good result.
selected_products = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Debt collection',
    'Mortgage',
    'Credit card or prepaid card',
    'Checking or savings account',
]
train_test_df = complaint_df[complaint_df['Product'].isin(selected_products)]

In [None]:
# 1.3.4 Preview the first 5 rows of the filtered data that is split into
# training and test data in the next step
train_test_df.head(5)

In [None]:
# 1.3.5 Split the data into training and test data (ratio: 80/20).
# Training data: 80% of every product group (fixed seed for a repeatable split).
train_orig_df = train_test_df.groupby('Product').sample(frac=0.8, random_state=6)
# Test data: every remaining row, i.e. the other 20%.
test_orig_df = train_test_df.drop(train_orig_df.index)

# Both frames get a fresh 0..n-1 index after sampling.
train_orig_df = train_orig_df.reset_index(drop=True)
test_orig_df = test_orig_df.reset_index(drop=True)

# Summary of the training split
print("Training data:\n")
print("Number of training samples:\n{}".format(len(train_orig_df)))
print("Samples by product group:\n{}".format(train_orig_df['Product'].value_counts()))

# Summary of the test split
print("\nTest data:\n")
print("Number of test samples:\n{}".format(len(test_orig_df)))
print("Samples by product group:\n{}".format(test_orig_df['Product'].value_counts()))

In [None]:
# 1.3.6 Create the data in a JSON format. The training and test data is written to files. 
def prepare_data(df):
    """Return a new DataFrame with the two columns used for classification.

    The 'Consumer complaint narrative' column becomes ``text`` and the
    'Product' column becomes ``labels``. Every label is wrapped in a
    one-element list, because the label column should be an array (although
    there is only one label per complaint). The result has a fresh 0..n-1
    index; *df* itself is not modified.
    """
    # original column name -> identifier expected by Watson NLP
    column_mapping = {"Consumer complaint narrative": "text", 'Product': 'labels'}
    selected = df[list(column_mapping)].reset_index(drop=True)
    prepared = selected.rename(columns=column_mapping)
    prepared['labels'] = prepared['labels'].map(lambda label: [label])
    return prepared

# Preview the first 10 rows of the training split before it is converted
train_orig_df.head(10)

In [None]:
# Convert both splits to the text/labels layout and drop every complaint
# whose text is missing ('NaN').
train_df = prepare_data(train_orig_df).dropna(subset=['text'], how='all')
test_df = prepare_data(test_orig_df).dropna(subset=['text'], how='all')

# Write each split as a JSON list of records into the working directory.
train_file = directory + "/train_data.json"
test_file = directory + "/test_data.json"
train_df.to_json(train_file, orient='records')
test_df.to_json(test_file, orient='records')

# List the JSON files that now exist in the working directory.
json_files = list(pathlib.Path(directory).glob('*.json'))
print("JSON files '% s' created" % json_files)

train_df.head(10)

In [None]:
# 1.3.7 Show the training data with the label lists expanded to one row per label
train_df.explode('labels')

In [None]:
# 1.3.8 Show the test data with the label lists expanded to one row per label
test_df.explode('labels')

In [None]:
# 1.3.9 Bar chart with the number of test samples per label
pio.templates["plotly_dark_custom"] = pio.templates["plotly_dark"]
plotly_template = pio.templates["plotly_dark"]

test_label_counts = test_df.explode('labels')['labels'].value_counts()
complaints_total_figure = px.bar(test_label_counts)
complaints_total_figure.update_layout(
    template=plotly_template,
    barmode='stack',
    title_text='Show test dataset',
    title_x=0.5,
)
complaints_total_figure.show()

In [None]:
# 1.3.10 Bar chart with the number of training samples per label
pio.templates["plotly_dark_custom"] = pio.templates["plotly_dark"]
plotly_template = pio.templates["plotly_dark"]

train_label_counts = train_df.explode('labels')['labels'].value_counts()
complaints_total_figure = px.bar(train_label_counts)
complaints_total_figure.update_layout(
    template=plotly_template,
    barmode='stack',
    title_text='Show training dataset',
    title_x=0.5,
)
complaints_total_figure.show()

# 2. Build the model

In [None]:
# 2.1 Download and load the two stock models needed for the SVM classifier:
# the syntax model (the SVM classifier block depends on the syntax block)
# and the USE embeddings.

# Syntax Model
syntax_model_path = watson_nlp.download('syntax_izumo_en_stock')
syntax_model = watson_nlp.load(syntax_model_path)

# USE Embedding Model
use_model_path = watson_nlp.download('embedding_use_en_stock')
use_model = watson_nlp.load(use_model_path)

In [None]:
# 2.2 Turn the training file into data streams, because classification
# blocks expect the training data to be in data streams.

training_data_file = train_file
print ("Training data file '%s'"% train_file)

# Read the records of the training file; every record is expected to carry
# a 'text' string and a 'labels' list.
data_stream_resolver = DataStreamResolver(
    target_stream_type=list,
    expected_keys={'text': str, 'labels': list},
)
training_data = data_stream_resolver.as_data_stream(training_data_file)
text_stream = training_data[0]
labels_stream = training_data[1]

# text -> syntax analysis -> USE embedding
syntax_stream = syntax_model.stream(text_stream)
use_train_stream = use_model.stream(syntax_stream, doc_embed_style='raw_text')

# Combine the embedding stream with the labels stream as SVM training input
use_svm_train_stream = watson_nlp.data_model.DataStream.zip(use_train_stream, labels_stream)

In [None]:
# 2.3 Train the SVM classifier on the zipped stream of USE embeddings and labels.
# This can take several minutes!
svm_model = SVM.train(use_svm_train_stream)

## 2.1 Train an ensemble classification model with Watson NLP

The ensemble model combines three classification models:

* CNN
* SVM with TF-IDF features
* SVM with USE (Universal Sentence Encoder) features

In [None]:
# 2.1.1 Train the ensemble classifier.
# This can take up to one hour!

# Stopwords that are passed to the ensemble training below
stopwords = watson_nlp.download_and_load('text_stopwords_classification_ensemble_en_stock')

# Train the ensemble classifier on the training file written in step 1.3.
# To restrict the training time, the epochs to train the CNN classifier are limited to 5.
# This is an optional attribute - if not specified, the default will be 30 epochs.

ensemble_model = Ensemble.train(train_file, 'syntax_izumo_en_stock', 'embedding_glove_en_stock', 'embedding_use_en_stock', stopwords=stopwords, cnn_epochs=5)

## 2.2 Save and download the model to the local machine

>Please select `"Insert project token"` from the menu in the **Jupyter Notebook Editor of Watson Studio**. This will add the code below to the top of your notebook. You must run the code and then return to the current step.

In [None]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# project = Project(project_id='YOUR_ID', project_access_token='YOUR_TOKEN')
# pc = project.project_context

In [None]:
# 2.2.1 Store the trained SVM model in the project under the name 'svm_model'
# (an existing entry with that name is overwritten).
# This can take several minutes!
# `project` is created by the inserted project token code; that code has to
# be run before this cell.

project.save_data('svm_model', data=svm_model.as_file_like_object(), overwrite=True)

In [None]:
# Store the trained ensemble model in the project under the name
# 'ensemble_model' (an existing entry with that name is overwritten).
project.save_data('ensemble_model', data=ensemble_model.as_file_like_object(), overwrite=True)

Download the `ensemble_model` to your local machine.

* Select your project
* Select `Asset types` -> `Data`
* Select `Download` from the dropdown list for the `ensemble_model` 

>Note: The file is larger than 1 GB!

## 2.3 Load the models

In [None]:
# 2.3.1 Load the SVM model that was stored in the project as 'svm_model'.
# This replaces the in-memory `svm_model` from the training step.
svm_model = watson_nlp.load(project.get_file('svm_model'))

In [None]:
# 2.3.2 Fetch the stored ensemble model ('ensemble_model') from the project.
# `model` holds what `project.get_file` returns; it is turned into a model
# object by `watson_nlp.load` in the next cell.
model = project.get_file('ensemble_model')

In [None]:
# Build the ensemble model object from the file fetched in step 2.3.2.
# NOTE(review): `predict_product` below calls `ensemble_model` (the freshly
# trained model), not `ensemble_model_new` - confirm which one the evaluation
# is supposed to use.
ensemble_model_new = watson_nlp.load(model)

# 3. Model evaluation

In [None]:
# 3.1 Create a helper method to run both models on a single complaint and return the predicted product groups of both models.

def predict_product(text):
    """Predict the product group of one complaint with both models.

    Relies on the notebook-level objects ``syntax_model``, ``use_model``,
    ``svm_model`` and ``ensemble_model``.

    Returns:
        A tuple ``(predicted_svm, predicted_ensemble)`` holding the
        ``class_name`` of the first entry in each model's ``classes`` list.
    """
    # SVM path: syntax analysis -> USE embedding -> SVM classifier
    syntax_result = syntax_model.run(text)
    use_embedding = use_model.run(syntax_result, doc_embed_style='raw_text')
    svm_top_class = svm_model.run(use_embedding).to_dict()["classes"][0]
    predicted_svm = svm_top_class["class_name"]

    # Ensemble path: the ensemble model is run on the raw text
    ensemble_top_class = ensemble_model.run(text).to_dict()["classes"][0]
    predicted_ensemble = ensemble_top_class["class_name"]

    return (predicted_svm, predicted_ensemble)

In [None]:
# 3.2 Run the models on the complete test data.

text_col = 'Consumer complaint narrative'

# Complaints without narrative text were removed from `test_df` in step 1.3
# but are still part of `test_orig_df`. A missing value (NaN) is not text and
# presumably cannot be processed by the models, so these rows are removed
# here as well. The index is rebuilt because the later cells align the
# predictions with `test_orig_df` by row position.
test_orig_df = test_orig_df.dropna(subset=[text_col]).reset_index(drop=True)

# One (SVM prediction, ensemble prediction) tuple per complaint
predictions = test_orig_df[text_col].apply(predict_product)
test_orig_df.head(10)

In [None]:
# Spread the (SVM, ensemble) prediction tuples over two named columns
predictions_df = pd.DataFrame.from_records(predictions, columns=('Predicted SVM', 'Predicted Ensemble'))
predictions_df.head(10)

In [None]:
# Put complaint text, actual product and both predictions side by side.
# The frames are matched on their index, so row i of `predictions_df` has to
# belong to row i of `test_orig_df`.
result_df = test_orig_df[[text_col, "Product"]].merge(predictions_df, how='left', left_index=True, right_index=True)
result_df.head(10)

# 4. Creating and plotting a confusion matrix

In [None]:
# Product groups that are listed in the report
svm_report_labels = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Mortgage',
    'Credit card or prepaid card',
    'Debt collection',
    'Checking or savings account',
]

# Actual product groups and the groups predicted by the SVM classifier
actual = result_df['Product']
predicted_svm = result_df['Predicted SVM']

# Classification report with precision, recall, f1-score and accuracy
matrix = classification_report(actual, predicted_svm, labels=svm_report_labels)
print('Classification report for SVM classifier: \n',matrix)

In [None]:
# Product groups that are listed in the report
ensemble_report_labels = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Mortgage',
    'Credit card or prepaid card',
    'Debt collection',
    'Checking or savings account',
]

# Product groups predicted by the ensemble classifier
predicted_ensemble = result_df['Predicted Ensemble']

# Classification report with precision, recall, f1-score and accuracy;
# `actual` comes from the SVM report cell.
matrix = classification_report(actual, predicted_ensemble, labels=ensemble_report_labels)
print('Classification report for Ensemble classifier: \n',matrix)

In [None]:
# Confusion matrices: actual product (rows) against predicted product
# (columns), normalized so that every row adds up to 1.
actual_products = result_df['Product']
SVM_confusion_df = pd.crosstab(actual_products, result_df['Predicted SVM'], rownames=['Actual'], normalize='index')
ensemble_confusion_df = pd.crosstab(actual_products, result_df['Predicted Ensemble'], rownames=['Actual'], normalize='index')

# One heatmap per classifier, side by side
figure, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,7))
for axis, confusion_df, title in (
    (ax1, SVM_confusion_df, "SVM"),
    (ax2, ensemble_confusion_df, "Ensemble"),
):
    sn.heatmap(confusion_df, annot=True, cmap="YlGnBu", ax=axis, cbar=False)
    axis.title.set_text(title)

# The row labels are shown on the left plot only
ax2.set_yticklabels([])

plt.show()