In [None]:
import os
import pandas as pd
import numpy as np
import spacy
from spacy.util import minibatch, compounding
import random
from sklearn.model_selection import train_test_split
from scipy.stats import ks_2samp
from sklearn.metrics import f1_score
import re
from tqdm.notebook import tqdm
from tabulate import tabulate
from multiprocessing import Pool
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot
from sklearn.metrics import roc_curve
from sklearn.metrics import cohen_kappa_score

# Display every top-level expression result in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Directory (empty string) and file name of the input CSV; the two are
# combined with os.path.join when the file is read.
data_path = r''
training_data = 'case_study_data.csv'

In [None]:
# Read the CSV into `td`, then show its shape and first rows.
td = pd.read_csv(os.path.join(data_path, training_data))
td.shape
td.head()

In [None]:
def preprocess_docs(doc):
    """
    Lower-case a document and blank out masking tokens and some punctuation.

    Three kinds of span (each with an optional neighbouring character on
    either side) are replaced by a single space:
      * runs of four x's (e.g. "xxxx"),
      * date masks of the form xx/xx/xx .. xx/xx/xxxx,
      * any of the characters ( ) { } , .
    Runs of whitespace are then collapsed to one space. Leading/trailing
    spaces are NOT stripped.

    Arguments:
    doc: str

    Returns:
    str
    """
    remove_rule_patterns = r"([\s\W]?[xX]{4}[\s\W]?)|([\s\.]?[xX]{2}/[xX]{2}/[xX]{2,4}[\s\.]?)|([\s\W]?[\(\)\{\}\,\.][\s\W]?)"
    lowered = doc.lower()
    without_masks = re.compile(remove_rule_patterns).sub(' ', lowered)
    return re.compile(r'\s+').sub(' ', without_masks)

In [None]:
def preprocess_tuple_list(tuplist):
    """
    Apply preprocess_docs to the text of every (text, label) pair.

    A plain comprehension is used instead of np.vectorize: vectorize is only
    a Python-level loop, it calls the function an extra time on the first
    element to infer the output type, and it raises on empty input.

    Arguments:
    tuplist: iterable of (str, label) pairs

    Returns:
    list of (preprocessed str, label) pairs in the original order; an empty
    input yields an empty list.
    """
    return [(preprocess_docs(doc), label) for doc, label in tuplist]

In [None]:
def set_binary_cat(product, target_product):
    """
    Build the binary category annotation for one record.

    Returns {'cats': {'Y': ..., 'N': ...}} where 'Y' is True exactly when
    `product` equals `target_product` and 'N' is its negation.
    """
    is_target = bool(product == target_product)
    return {'cats': {'Y': is_target, 'N': not is_target}}

def assign_category(df, doc_column, label_column, target_product=None):
    """
    Pair every document in `df` with its binary category annotation.

    Arguments:
    df: pandas DataFrame
    doc_column: str, name of the column holding the document text
    label_column: str, name of the column holding the product label
    target_product: str, optional. Product treated as the positive class.
        When omitted, the module-level `target_product` variable is used,
        which is what this function relied on implicitly before.

    Returns:
    list of (document, {'cats': {'Y': bool, 'N': bool}}) tuples, one per row.

    Raises:
    NameError if `target_product` is not passed and no module-level
    `target_product` exists.
    """
    if target_product is None:
        try:
            target_product = globals()['target_product']
        except KeyError:
            raise NameError("name 'target_product' is not defined") from None

    # The input frame is only read, so no defensive copy is needed.
    cats = [set_binary_cat(label, target_product) for label in df[label_column]]
    return list(zip(df[doc_column], cats))

In [None]:
def load_data(df, target_product, limit, split):
    """
    Build a class-balanced train/test set for one product's binary classifier.

    Rows whose `product_group` equals `target_product` are the positives.
    The remaining rows are undersampled to the same count, stratified by
    `product_group` so the mix of other products is preserved. The combined
    records are randomly sampled down to `limit` and split into train/test.

    Arguments:
    df: pandas DataFrame with `text` and `product_group` columns
    target_product: str, product treated as the positive class
    limit: int, maximum number of records (train + test) to return
    split: float, fraction of the returned records allocated to training

    Returns:
    (train, test): two lists of (text, {'cats': {'Y': bool, 'N': bool}}).

    Raises:
    ValueError if `df` has no rows for `target_product`.

    Note: reads the module-level `significance` for the distribution check.
    """
    # Positives: the target product. Negatives: every other product.
    X_prod_i = df[df['product_group'] == target_product]
    X_prod_j = df[df['product_group'] != target_product]

    if len(X_prod_i) == 0:
        raise ValueError(f"No rows found with product_group == {target_product!r}")

    if len(X_prod_j) <= len(X_prod_i):
        # The "majority" class is not larger than the target class, so there
        # is nothing to undersample; use every negative row.
        X_prod_j_ss, y_prod_j_ss = X_prod_j.text, X_prod_j.product_group
    else:
        # Undersample the negatives to the size of the positives. test_size
        # is left unset so it defaults to the complement; combining an
        # integer train_size with test_size=0.3 raised whenever the two
        # together exceeded the number of negative rows.
        try:
            X_prod_j_ss, _, y_prod_j_ss, _ = train_test_split(
                X_prod_j.text,
                X_prod_j.product_group,
                train_size=len(X_prod_i),
                stratify=X_prod_j.product_group
            )
        except ValueError:
            # Stratification is impossible, e.g. a product with one complaint.
            print('Stratified undersampling not possible; using a simple random subset.')
            X_prod_j_ss, _, y_prod_j_ss, _ = train_test_split(
                X_prod_j.text,
                X_prod_j.product_group,
                train_size=len(X_prod_i)
            )

        # Sanity check that the subset kept the product mix of the negatives.
        D_prod_j = X_prod_j.product_group.value_counts(normalize=True)
        D_prod_j_train = y_prod_j_ss.value_counts(normalize=True)
        test = ks_2samp(D_prod_j, D_prod_j_train, alternative='two-sided', mode='asymp')
        if test.pvalue < significance:
            print('Product label distribution not preserved in subset!!!')

    y_cats_i = [set_binary_cat(product_group, target_product) for product_group in X_prod_i.product_group]
    Xy_i = list(zip(X_prod_i.text, y_cats_i))
    y_cats_j = [set_binary_cat(product_group, target_product) for product_group in y_prod_j_ss]
    Xy_j = list(zip(X_prod_j_ss, y_cats_j))

    Xy_agg = Xy_i + Xy_j

    # random.sample raises if asked for more items than exist, so cap the
    # request at the number of available records.
    Xy_agg_filtered = random.sample(Xy_agg, min(limit, len(Xy_agg)))
    train_size = int(len(Xy_agg_filtered) * split)

    return Xy_agg_filtered[:train_size], Xy_agg_filtered[train_size:]

# Construct Classifier

In [None]:
def train_model(
    training_data: list,
    test_data: list,
    iterations: int,
    model_architecture: str
) -> None:
    """
    Train a binary ("Y"/"N") spaCy text categorizer and save it to disk.

    Arguments:
    training_data: list of (text, {'cats': {'Y': bool, 'N': bool}}) pairs
    test_data: list of the same shape, scored after every iteration
    iterations: int, number of passes over the training data
    model_architecture: str, textcat architecture name passed to spaCy

    Side effects:
    Prints loss/precision/recall/F-score per iteration and writes the model
    under "models/<target_product>_<model_architecture>_model_artifacts_v<N>"
    relative to the current working directory, where N is the first version
    number not already on disk.

    Note: reads the module-level `target_product` to name the saved model.
    """
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_architecture}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label('Y')
    textcat.add_label('N')

    # Shuffle a private copy so the caller's list keeps its order.
    training_data = list(training_data)

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print('='*72)
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(
                    text,
                    labels,
                    drop=0.2,
                    sgd=optimizer,
                    losses=loss
                )
            # Evaluate with the averaged parameters.
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model under the first unused version number. os.path.join keeps
    # the path valid on every OS; the previous literal ".\\models\\" only
    # resolved to a sub-directory on Windows.
    models_root = "models"
    os.makedirs(models_root, exist_ok=True)
    version = 1
    model_dir_name = os.path.join(
        models_root, f"{target_product}_{model_architecture}_model_artifacts_v{version}"
    )
    while os.path.isdir(model_dir_name):
        version += 1
        model_dir_name = os.path.join(
            models_root, f"{target_product}_{model_architecture}_model_artifacts_v{version}"
        )
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(model_dir_name)

In [None]:
def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    """
    Compute precision, recall and F-score of `textcat` for the "Y" label.

    Arguments:
    tokenizer: callable turning a text into a document for `textcat.pipe`
    textcat: object whose `pipe(docs)` yields documents with a `cats`
        mapping of label -> score
    test_data: list of (text, {'cats': {'Y': bool, 'N': bool}}) pairs

    A score of 0.5 or more counts as a positive prediction. The two error
    counters start at 1e-8 so that neither denominator can be zero.

    Returns:
    dict with keys "precision", "recall" and "f-score".
    """
    texts, gold_labels = zip(*test_data)
    docs = (tokenizer(text) for text in texts)

    tp, tn = 0, 0
    fp, fn = 1e-8, 1e-8  # epsilon keeps the denominators non-zero

    for doc, gold in zip(textcat.pipe(docs), gold_labels):
        gold_cats = gold['cats']
        for label, score in doc.cats.items():
            # The "N" score is redundant with the "Y" score; skip it.
            if label == "N":
                continue
            if score >= 0.5:
                if gold_cats["Y"]:
                    tp += 1
                elif gold_cats["N"]:
                    fp += 1
            elif score < 0.5:
                if gold_cats["N"]:
                    tn += 1
                elif gold_cats["Y"]:
                    fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    total = precision + recall
    f_score = 0 if total == 0 else 2 * (precision * recall) / total

    return {"precision": precision, "recall": recall, "f-score": f_score}

# Model Tests

In [None]:
def manual_product_model_test(input_data, target_product, version=1, model_directory=None):
    """
    Score one complaint with the saved binary model(s) of a single product
    and print the prediction ("Y" or "N") with the corresponding score.

    Arguments:
    input_data: str, complaint text to score
    target_product: str, product whose model(s) should be used
    version: int, model version to use (matched against the "_v<version>"
        suffix of the saved model directory name)
    model_directory: str, optional. Directory holding the saved models;
        defaults to the original project location.
    """
    if model_directory is None:
        model_directory = r'C:\Users\vince\Projects\wf_case_study\data\models'

    # Match the full "_v<version>" suffix. Comparing only the last character
    # made version 1 also match v11, v21, ... and could never match v10+.
    version_suffix = f"_v{version}"
    product_prefix = f"{target_product}_"
    matching_models = [
        name for name in os.listdir(model_directory)
        if name.startswith(product_prefix) and name.endswith(version_suffix)
    ]

    if not matching_models:
        print(f"No saved model found for product '{target_product}' version {version} in {model_directory}")
        return

    for model_name in matching_models:
        model = spacy.load(os.path.join(model_directory, model_name))
        # Generate prediction
        parsed_text = model(input_data)
        # Report whichever label has the higher score.
        if parsed_text.cats["Y"] > parsed_text.cats["N"]:
            prediction = "Y"
            score = parsed_text.cats["Y"]
        else:
            prediction = "N"
            score = parsed_text.cats["N"]
        print(f"Target product: {target_product}")
        print(
            f"Review text: {input_data}\nComplaint is for {target_product} product?: {prediction}"
            f"\tScore: {score}"
        )

In [None]:
def manual_model_test(input_data, version=1, model_directory=None):
    """
    Score one complaint with every saved product model of the given version
    and print a table of per-product predictions plus the winning product.

    The winner is the product with the highest "Y" score among the models
    that predicted "Y". For an "N" prediction the table shows the negated
    "N" score.

    Arguments:
    input_data: str, complaint text to score
    version: int, model version to use (matched against the "_v<version>"
        suffix of the saved model directory name)
    model_directory: str, optional. Directory holding the saved models;
        defaults to the original project location.
    """
    if model_directory is None:
        model_directory = r'C:\Users\vince\Projects\wf_case_study\data\models'

    # Raw string: "\w" and "\d" are invalid escapes in a normal (f-)string.
    name_pattern = re.compile(r"(\w+)_simple_cnn_model_artifacts_v\d")
    version_suffix = f"_v{version}"

    print(f'Review text: \n{input_data}')

    table_headers = ['Product Classifier', 'Prediction', 'Score']
    table_rows = []
    prediction_dict = {}

    for model_name in os.listdir(model_directory):
        if not model_name.endswith(version_suffix):
            continue
        m = name_pattern.match(model_name)
        if not m:
            # Not a product model directory: skip it rather than reuse the
            # product name from a previous iteration.
            continue
        current_product = m.group(1)

        model = spacy.load(os.path.join(model_directory, model_name))
        parsed_text = model(input_data)
        if parsed_text.cats["Y"] > parsed_text.cats["N"]:
            prediction = "Y"
            score = parsed_text.cats["Y"]
        else:
            prediction = "N"
            score = -1*parsed_text.cats["N"]

        prediction_dict[current_product] = {'prediction': prediction, 'score': score}
        table_rows.append([current_product, prediction, score])

    candidate_predictions = {product: prod_dict['score']
                             for product, prod_dict in prediction_dict.items()
                             if prod_dict['prediction']=='Y'
                            }

    print('='*51)
    print(tabulate(table_rows, headers=table_headers, tablefmt='orgtbl'))

    if not candidate_predictions:
        # max() on an empty sequence would raise; report the situation instead.
        print("\nNo product classifier predicted 'Y' for this complaint.\n")
        return

    predicted_product = max(candidate_predictions, key=candidate_predictions.get)
    print(f"\n!!!!!!!!!!========== AND THE WINNER IS!!!!! ==============!!!!!!!!!!\n\n{' '*21}{predicted_product}\n")

In [None]:
def prediction_df2(loaded_model, input_text, current_product, preprocess_doc=False):
    """
    Score one text with one product's binary model.

    Arguments:
    loaded_model: callable returning a document with a `cats` mapping that
        contains "Y" and "N" scores
    input_text: text to score (converted with str())
    current_product: str, product the model was trained for
    preprocess_doc: bool, run preprocess_docs on the text first

    Returns:
    (prediction, score):
      * (current_product, Y score) when "Y" scores higher than "N";
      * ("other_product", -1 * N score) otherwise.

    The "N" score is negated, as manual_model_test does, so that scores from
    different product models can be ranked: a confident "N" must rank below
    any "Y". determine_thresholds already maps negative scores back onto
    [0, 1]. Returning the positive "N" score let score_df's idxmax pick the
    product whose model was most confident the text was NOT that product.
    """
    if preprocess_doc:
        input_doc = str(preprocess_docs(input_text))
    else:
        input_doc = str(input_text)

    parsed_text = loaded_model(input_doc)

    if parsed_text.cats["Y"] > parsed_text.cats["N"]:
        prediction = current_product
        score = parsed_text.cats["Y"]
    else:
        prediction = "other_product"
        score = -1 * parsed_text.cats["N"]

    return prediction, score

In [None]:
def score_df(df, text_column, version=1, preprocess_doc=False, model_directory=None):
    """
    Score every complaint in a dataframe with all saved product models of
    one version.

    For each product model two columns are appended, `<product>_pred` and
    `<product>_score` (see prediction_df2), plus a final `predicted_product`
    column naming the product whose score column is highest for the row.

    Arguments:
    df: pandas DataFrame
    text_column: str, name of the column containing the complaint text
    version: int, model version to use (matched against the "_v<version>"
        suffix of the saved model directory name)
    preprocess_doc: bool, run preprocess_docs on each text before scoring
    model_directory: str, optional. Directory holding the saved models;
        defaults to the original project location.

    Returns:
    A copy of `df` without the rows whose text is null, with the prediction
    columns appended. The input frame is not modified.

    Raises:
    ValueError if no matching model is found in `model_directory`.
    """
    if model_directory is None:
        model_directory = r'C:\Users\vince\Projects\wf_case_study\data\models'

    # Raw string: "\w" and "\d" are invalid escapes in a normal (f-)string.
    name_pattern = re.compile(r"(\w+)_simple_cnn_model_artifacts_v\d")
    # Match the full "_v<version>" suffix so version 1 does not match v11.
    version_suffix = f"_v{version}"
    product_models = [
        name for name in os.listdir(model_directory)
        if name.endswith(version_suffix) and name_pattern.match(name)
    ]
    if not product_models:
        raise ValueError(f"No version {version} product models found in {model_directory}")

    # Work on a copy and drop rows with no text to score.
    temp_df = df[df[text_column].notnull()].copy()
    # Use the caller-specified column (previously hard-coded to 'text').
    texts = list(temp_df[text_column])

    score_cols = []
    for model_name in tqdm(product_models):
        current_product = name_pattern.match(model_name).group(1)
        print(f'version: {version}\ncurrent_product: {current_product}')

        # Load one model at a time so only one is held in memory.
        model = spacy.load(os.path.join(model_directory, model_name))

        # Plain loop instead of np.vectorize: vectorize raises on empty
        # input and scores the first row twice to infer output types.
        results = [prediction_df2(model, text, current_product, preprocess_doc) for text in texts]
        temp_df[f'{current_product}_pred'] = [prediction for prediction, _ in results]
        temp_df[f'{current_product}_score'] = [score for _, score in results]
        score_cols.append(f'{current_product}_score')

    # Rank only the columns produced above, not any pre-existing "*_score"
    # column of the input frame.
    temp_df['predicted_product'] = temp_df[score_cols].idxmax(axis=1)
    temp_df['predicted_product'] = temp_df['predicted_product'].apply(lambda x: x[:-6])

    return temp_df

In [None]:
def determine_thresholds(scored_df, products=None):
    """
    Determine a classification threshold per product for a scored dataframe.

    For each product the ROC curve of its score column against the true
    label (`product_group == product`) is computed, and the threshold that
    maximises the geometric mean of TPR and (1 - FPR) is selected.

    Arguments:
    scored_df: pandas DataFrame with a `product_group` column and one
        `<product>_score` column per product (as produced by score_df)
    products: list of str, optional. Products to evaluate; defaults to the
        seven product groups used in this notebook.

    Returns:
    dict mapping product -> {'threshold', 'tpr', 'fpr', 'thresholds'}.
    """
    if products is None:
        products = ['bank_service', 'credit_reporting', 'mortgage', 'money_transfers',
                    'credit_card', 'loan', 'debt_collection']

    threshold_dict = {}
    for product in products:
        # Ground truth must come from the actual label only. Comparing
        # product_group with the model's own `<product>_pred` column marked
        # every false negative as a true negative.
        y_true = (scored_df['product_group'] == product).astype(int).to_numpy()

        # Negative scores encode "other product" predictions as -P(N);
        # adding 1 maps them back to P(Y) so all scores lie on one scale.
        raw_score = scored_df[f'{product}_score']
        y_hat = np.where(raw_score < 0, raw_score + 1, raw_score)

        fpr, tpr, thresholds = roc_curve(y_true, y_hat)
        gmeans = np.sqrt(tpr * (1-fpr))
        ix = np.argmax(gmeans)

        threshold_dict[product] = {'threshold': thresholds[ix], 'tpr': tpr, 'fpr': fpr, 'thresholds':thresholds }

    return threshold_dict

In [None]:
def roc_curve_generator(thresh_dict, product):
    """
    Plot the ROC curve stored for `product` next to a diagonal baseline.

    Arguments:
    thresh_dict: dict mapping product -> dict with 'fpr' and 'tpr' entries
    product: str, key of the curve to draw
    """
    # Diagonal reference line.
    diagonal = [0, 1]
    pyplot.plot(diagonal, diagonal, linestyle='--', label='No Skill')

    # Curve of the requested product.
    curve = thresh_dict[product]
    pyplot.plot(curve['fpr'], curve['tpr'], marker='.', label=product)

    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()

In [None]:
def performance_metrics(df, threshold_dict, products=None):
    """
    Print per-product F1, precision, recall and confusion counts.

    A row counts as predicted-positive for a product when its score is at
    or above that product's threshold.

    Arguments:
    df: pandas DataFrame with a `product_group` column and one
        `<product>_score` column per product (as produced by score_df)
    threshold_dict: dict mapping product -> {'threshold': float, ...}
        (as produced by determine_thresholds)
    products: list of str, optional. Products to report; defaults to the
        seven product groups used in this notebook.

    Returns:
    None; the product distribution and the metrics table are printed.
    """
    if products is None:
        products = ['bank_service', 'credit_reporting', 'mortgage', 'money_transfers',
                    'credit_card', 'loan', 'debt_collection']

    table_headers = ['Product Classifer', 'f1', 'Precision', 'Recall', 'TP', 'FP', 'FN', 'TN']
    true_labels = list(df['product_group'])
    table_rows = []

    # The context manager closes the progress bar even if a lookup fails.
    with tqdm(total=len(df) * len(products)) as pbar:
        for product in products:
            threshold = threshold_dict[product]['threshold']

            # Apply the same normalisation determine_thresholds used when
            # the threshold was chosen (negative scores are -P(N)), so the
            # threshold and the scores are on the same scale.
            raw_score = df[f'{product}_score']
            scores = np.where(raw_score < 0, raw_score + 1, raw_score)

            # fp/fn start at a tiny epsilon so the denominators are never 0.
            tp, tn, fp, fn = 0, 0, 1e-8, 1e-8
            for true_label, score in zip(true_labels, scores):
                is_product = true_label == product
                is_flagged = score >= threshold
                if is_product and is_flagged:
                    tp += 1
                elif is_product:
                    fn += 1
                elif is_flagged:
                    fp += 1
                else:
                    tn += 1
                pbar.update(1)

            precision = tp / (tp + fp)
            recall = tp / (tp + fn)

            if precision + recall == 0:
                f1 = 0
            else:
                f1 = 2 * (precision * recall) / (precision + recall)

            table_rows.append([product, f1, precision, recall, tp, int(fp), int(fn), tn])

    print(f'Dataframe size: {len(df)}\nDistribution of product_group:\n{df.product_group.value_counts()}')
    print('='*50)
    print(tabulate(table_rows, headers=table_headers, tablefmt='orgtbl'))

In [None]:
def plot_confusion_matrix(classifers,
                          target_names,
                          cmap=None,
                          normalize=True):
    """
    Plot one or more confusion matrices side by side.

    Arguments
    ---------
    classifers:   dict mapping a panel title to a confusion matrix (as
                  returned by sklearn.metrics.confusion_matrix); one subplot
                  is drawn per entry, in dict order

    target_names: class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to omit tick labels

    cmap:         colormap passed to imshow (name or colormap object);
                  defaults to 'Blues'

    normalize:    If False, annotate cells with the raw counts
                  If True, annotate cells with row-wise proportions

    Usage
    -----
    plot_confusion_matrix({'Model 1': cm1, 'Model 2': cm2},
                          target_names = y_labels_vals,
                          normalize    = True)

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(25, 15))
    for panel, (title, cm) in enumerate(classifers.items(), start=1):
        cm = np.asarray(cm)
        plt.subplot(1, len(classifers), panel)

        total = float(np.sum(cm))
        accuracy = np.trace(cm) / total if total else 0.0
        misclass = 1 - accuracy

        # The image shows the matrix as given; only the cell annotations
        # are normalised below.
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)

        if target_names is not None:
            tick_marks = np.arange(len(target_names))
            plt.xticks(tick_marks, target_names, rotation=45)
            plt.yticks(tick_marks, target_names)

        if normalize:
            # Rows with no true samples stay at 0 instead of becoming NaN.
            row_totals = cm.sum(axis=1)[:, np.newaxis]
            cm = np.divide(cm.astype('float'), row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0)

        # Cells above this value get white text for contrast.
        thresh = cm.max() / 1.5 if normalize else cm.max() / 2
        cell_format = "{:0.3f}" if normalize else "{:,}"
        # Distinct index names: the panel counter is not shadowed.
        for row, col in np.ndindex(cm.shape):
            plt.text(col, row, cell_format.format(cm[row, col]),
                     horizontalalignment="center",
                     color="white" if cm[row, col] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label\naccuracy={:0.3f}; misclass={:0.3f}'.format(accuracy, misclass))

    plt.show()

# Train Model

In [None]:
# Training Parameters
project_directory = r'C:\Users\vince\Projects\wf_case_study\data'
# Product groups to train one binary classifier for. The training loops and
# the confusion-matrix / kappa cells read this name, so it must be defined
# here for the notebook to run on a fresh kernel.
products = ['bank_service', 'credit_reporting', 'mortgage', 'money_transfers',
            'credit_card', 'loan', 'debt_collection']
volume_limit = 1500 # Maximum number of records (train + test) per product.
training_iterations = 3
model_architecture = 'simple_cnn'
significance = 0.05 # Significance for test of Distribution of product_label for undersampled majority class. (load_data)
split = 0.7 # Percentage of records to be allocated to training set.

In [None]:
# Models are saved relative to the working directory, so switch to the project.
os.chdir(project_directory)
for product in products:
    # train_model reads the module-level `target_product` to name the saved model.
    target_product = product
    print(f"working_directory: {project_directory}\nProduct being trained: {target_product}\nTrain+Test size limit: {volume_limit}\nTrain\\Test split:{split}\nModel Training Iterations: {training_iterations}")
    # Named train_set/test_set so the CSV file name stored in `training_data`
    # is not overwritten (re-running the CSV load cell would otherwise fail).
    train_set, test_set = load_data(td, target_product, limit=volume_limit, split=split)
    train_model(train_set, test_set, iterations=training_iterations, model_architecture=model_architecture)

In [None]:
# Training Parameters
# Second configuration: same settings as the earlier parameter cell except
# for a larger volume_limit (5000) and more training_iterations (5).
project_directory = r'C:\Users\vince\Projects\wf_case_study\data'
#target_product = 'bank_service'
volume_limit = 5000
training_iterations = 5
model_architecture = 'simple_cnn'
significance = 0.05 # Significance for test of Distribution of product_label for undersampled majority class. (load_data)
split = 0.7 # Percentage of records to be allocated to training set.

In [None]:
# Models are saved relative to the working directory, so switch to the project.
os.chdir(project_directory)
for product in products:
    # train_model reads the module-level `target_product` to name the saved model.
    target_product = product
    print(f"working_directory: {project_directory}\nProduct being trained: {target_product}\nTrain+Test size limit: {volume_limit}\nTrain\\Test split:{split}\nModel Training Iterations: {training_iterations}")
    # Named train_set/test_set so the CSV file name stored in `training_data`
    # is not overwritten (re-running the CSV load cell would otherwise fail).
    train_set, test_set = load_data(td, target_product, limit=volume_limit, split=split)
    train_model(train_set, test_set, iterations=training_iterations, model_architecture=model_architecture)

In [None]:
# Sample complaint text used for manual model checks (passed to manual_model_test below).
TEST_STRING_BS = """I opened a Bank of the the West account. The account came with a promotion. 
The promotion was get {$100.00} for setting up a qualifying direct deposit. I have received 
that promotion. The other promotion was, " get {$50.00} for making XXXX debit card purchases 
each month for 2 months. '' I have met the terms for the {$50.00} promotion and have not 
received the promotion that I am entitled to. I want the {$50.00} that I am entitled to that 
Bank of the West has not given me."""

In [None]:
# Second sample complaint text used for the manual model checks below.
TEST_STRING_CC = """I HAVE HAD AN ACCOUNT WITH DISCOVER CARD SINCE 2011. I HAVE PAID THEM AS AGREED MONTHLY EVER SINCE. HOWEVER, NOW THEY HAVE DECIDED THAT THEY WILL NOT GRANT ME CREDIT ANY LONGER? I HAVE NOW AN AVAILABLE CREDIT LIMIT OF SOME {$1500.00}. THEY REFUSE TO ALLOW ME TO USE THIS ACCOUNT? I AM VERY ANGRY ABOUT THIS TO SAY THE LEAST!"""

In [None]:
# Score TEST_STRING_CC with three single-product models (default version=1).
manual_product_model_test(TEST_STRING_CC, 'bank_service')
manual_product_model_test(TEST_STRING_CC, 'credit_reporting')
manual_product_model_test(TEST_STRING_CC, 'credit_card')

In [None]:
# Score TEST_STRING_BS with every version-2 product model.
manual_model_test(TEST_STRING_BS, 2)
#manual_model_test(TEST_STRING_CC)

In [None]:
# Generate a sample from the original dataframe to validate the models on.
# A fixed random_state makes the sample, and every metric computed from it,
# reproducible across kernel restarts.
eval_df = td.sample(500, random_state=42)

In [None]:
# Score TEST_STRING_CC with every version-2 product model.
manual_model_test(TEST_STRING_CC, 2)

In [None]:
# Score the evaluation sample with the version-1 models.
eval_df_scored1 = score_df(eval_df, 'text', version=1)

In [None]:
# Per-product classification thresholds for the version-1 scores.
thresholds1 = determine_thresholds(eval_df_scored1)

In [None]:
# Print per-product metrics for the version-1 models at those thresholds.
performance_metrics(eval_df_scored1, thresholds1)

In [None]:
# Score the same evaluation sample with the version-2 models.
eval_df_scored2 = score_df(eval_df, 'text', version=2)

In [None]:
# Per-product classification thresholds for the version-2 scores.
thresholds2 = determine_thresholds(eval_df_scored2)

In [None]:
# Print per-product metrics for the version-2 models at those thresholds.
performance_metrics(eval_df_scored2, thresholds2)

In [None]:
# True and predicted product labels for each model version, as plain lists.
labels_true_1 = list(eval_df_scored1['product_group'])
labels_predicted_1 = list(eval_df_scored1['predicted_product'])
labels_true_2 = list(eval_df_scored2['product_group'])
labels_predicted_2 = list(eval_df_scored2['predicted_product'])

In [None]:
# Confusion matrix for version 1; rows/columns follow the order of `products`.
cm1 = confusion_matrix(labels_true_1, labels_predicted_1, labels=products)
cm1

In [None]:
# Confusion matrix for version 2; rows/columns follow the order of `products`.
cm2 = confusion_matrix(labels_true_2, labels_predicted_2, labels=products)
cm2

In [None]:
# Draw both confusion matrices side by side, annotated with row-wise proportions.
plot_confusion_matrix({'Model 1': cm1, 'Model 2': cm2},
                          products,
                          cmap='YlOrRd',
                          normalize=True)

In [None]:
# Cohen's kappa between true and predicted product for the version-1 models.
cohen_kappa_score(list(eval_df_scored1['product_group']), list(eval_df_scored1['predicted_product']), labels=products)

In [None]:
# Cohen's kappa between true and predicted product for the version-2 models.
cohen_kappa_score(list(eval_df_scored2['product_group']), list(eval_df_scored2['predicted_product']), labels=products)