In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

import os
import codecs
import sys
import textacy

# Automated Tagging of Maintenance Issues:
## A Keyword Detection and Ranking Approach
### Thurston Sexton + Mike Brundage
Also: compared to a Machine Learning one

In [2]:
# All input artefacts sit under ./data; derive every path from that single root.
data_directory = os.path.join('.', 'data')
raw_excel_filepath, raw_csv_filepath, vocab_filepath = (
    os.path.join(data_directory, file_name)
    for file_name in ('tag_data.xlsx', 'raw_csv_tagged.csv', 'tag_vocab.csv')
)

In [3]:
# Read in the tagged issues, which have been cleaned of nasty unicode and have
# Description+Resolution combined into one text field.
# Decode explicitly as UTF-8: sys.getfilesystemencoding() describes how the OS encodes file
# *names*, varies by platform, and this same file is parsed as UTF-8 when the corpus is built.
df_raw = pd.read_csv(raw_csv_filepath, encoding='utf-8',
                     names=['RawText', 'p_Item', 'p_Action', 's_Action', 's_Item'])
# Spot-check a single record (only a cell's final expression is rendered, so the
# former mid-cell df_raw.head() displayed nothing and has been dropped).
df_raw.iloc[142].RawText

u'Monitoring Spindle Drive alarm wonaTMt clear. Replaced performance module'

In [4]:
# Read in the keyword dictionary (token -> named-entity class, alias, note) and do some data-munging.
# header=0 combined with names= replaces the file's own header row with these column names;
# index_col=0 makes `token` the index.
df_vocab = pd.read_csv(vocab_filepath, header=0, encoding=sys.getfilesystemencoding(),
                       names = ['token', 'NE','alias','note'], index_col=0)
df_vocab = df_vocab.dropna(subset=['NE'])  # remove named entities that are NaN
# NOTE(review): np.where on scalars yields a 0-d array rather than a plain string; the tagging
# code calls .tolist() on each alias, so it appears to rely on that -- confirm before
# vectorising this line.
df_vocab.alias = df_vocab.apply(lambda x: np.where(pd.isnull(x.alias), x.name, x.alias), axis=1) # alias to original if blank
df_vocab = df_vocab[~df_vocab.index.duplicated(keep='first')]  # keep only the first row of any repeated token
df_vocab.head()

Unnamed: 0_level_0,NE,alias,note
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
replace,S,replace,
unit,I,unit,
motor,I,motor,
spindle,I,spindle,
leak,P,leak,


In [5]:
# For each named-entity class (NE), count the number of *distinct* values in every column,
# e.g. how many different aliases each class maps to.
df_vocab.groupby("NE").nunique()

Unnamed: 0_level_0,NE,alias,note
NE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I,1,366,25
P,1,95,14
R,1,142,0
S,1,70,11
X,1,54,3


In [6]:
# Start up our NLP engine: spaCy wrapped in textacy.
docs = textacy.fileio.read.read_csv(raw_csv_filepath, encoding='utf-8')

# Field index 1 of each CSV record is used as the document text.
# NOTE(review): presumably the remaining fields go to the metadata stream -- confirm against the textacy docs.
content_stream, metadata_stream = textacy.fileio.split_record_fields(docs, 1)
corpus = textacy.Corpus(u'en', texts=content_stream, metadatas=metadata_stream)


In [7]:
## THIS GENERATED THE TOP N MOST IMPORTANT TOKENS VIA A DOC_TERM_MATRIX
## the engineers used this code-snippet to make tag_vocab.csv

# from unicodedata import normalize

# topn = 3000
# topn_tok = [id2term[i] for i in doc_term_matrix.sum(axis=0).argsort()[0,-topn:].tolist()[0][::-1]]
# with open('new_top{}.txt'.format(topn), 'wb') as f:
#     for i in topn_tok:
#         try:
#             f.write(i+'\n')
#         except UnicodeEncodeError:
#             print i, '-->', normalize('NFKD', i).encode('ascii','ignore')
#             f.write(normalize('NFKD', i).encode('ascii','ignore') +'\n')
            

In [8]:
def get_norm_tokens(doc_n, doc_term_mat, id2term):
    """Return the terms that have a non-zero weight in row `doc_n` of the doc-term matrix.

    doc_term_mat must support row indexing followed by `.toarray()`;
    id2term maps a column index to its term string.
    """
    dense_row = doc_term_mat[doc_n].toarray()
    _, column_ids = dense_row.nonzero()
    terms = []
    for column in column_ids:
        terms.append(id2term[column])
    return terms

def doc_to_tags(tokens, thes):
    """Map one issue's tokens onto tag aliases using the keyword thesaurus.

    Parameters
    ----------
    tokens : iterable of str
        Normalized terms (possibly space-separated n-grams) found in one issue.
    thes : pandas.DataFrame
        Thesaurus indexed by token, with an 'NE' column (class code) and an
        'alias' column (the tag to emit). The index is assumed to be unique.

    Returns
    -------
    (tags, untagged)
        tags : dict with keys 'I', 'P', 'S', each a duplicate-free list of aliases
            in first-seen order.
        untagged : duplicate-free list (first-seen order) of tokens that are not in
            the thesaurus and contain no recognized word.
    """
    tags = {'I': [], 'P': [], 'S': []}
    untagged = []
    # A set gives O(1) membership tests; the token loop would otherwise rescan
    # the whole vocabulary list for every token and every n-gram part.
    vocab = set(thes.index)
    for tok in tokens:
        if tok in vocab:  # recognized token?
            entry = thes.loc[tok]
            typ = entry['NE']
            if typ in tags:  # I, P, or S?  (any other class, e.g. R or X, is skipped)
                alias = entry['alias']
                # Aliases may be stored as numpy scalars / 0-d arrays; unwrap those,
                # but also accept plain strings, which have no .tolist().
                if hasattr(alias, 'tolist'):
                    alias = alias.tolist()
                if alias not in tags[typ]:
                    tags[typ].append(alias)
        elif any(part in vocab for part in tok.split(' ')):
            # Some word of this n-gram is itself a recognized token, so the n-gram
            # is not reported as unknown.
            pass
        elif tok not in untagged:  # not recognized :(
            untagged.append(tok)
    return tags, untagged
        
            
def tag_corpus(corpus, thes):
    """Tag every document in `corpus` with the keyword thesaurus `thes`.

    Builds a tf-idf document-term matrix over lemmatized 1- to 3-grams, then, for each
    document, looks its non-zero terms up via doc_to_tags. Returns a DataFrame with the
    columns RawText, Items, Problem, Solution and UK_tok (the unrecognized tokens), each
    tag column being a ', '-joined string.
    """
    # Lazily produce each document's term list: lemmas, n-grams of length 1-3,
    # punctuation removed. filter_stops is intentionally left at its default.
    term_lists = (
        doc.to_terms_list(ngrams=(1,2,3),
                          normalize=u'lemma',
                          named_entities=False,
                          filter_punct=True,
                          as_strings=True)
        for doc in corpus
    )
    # tf-idf weighting; min_df / max_df prune very rare and very common terms.
    doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
        term_lists,
        weighting='tfidf',
        normalize=False,
        smooth_idf=False,
        min_df=2, max_df=0.95)

    raw_texts, item_tags, solution_tags, problem_tags, unknown_toks = [], [], [], [], []
    # Walk over all issues, collecting one joined string per column.
    for doc_n, doc in tqdm_notebook(enumerate(corpus)):
        tokens = get_norm_tokens(doc_n, doc_term_matrix, id2term)
        tags, unknown = doc_to_tags(tokens, thes)
        unknown_toks.append(', '.join(unknown))
        raw_texts.append(doc.text)
        item_tags.append(', '.join(tags['I']))
        solution_tags.append(', '.join(tags['S']))
        problem_tags.append(', '.join(tags['P']))

    # Assemble the tagged frame with a fixed column order.
    column_order = ['RawText','Items','Problem','Solution','UK_tok']
    column_data = {
        'RawText': raw_texts,
        'Items': item_tags,
        'Problem': problem_tags,
        'Solution': solution_tags,
        'UK_tok': unknown_toks,  # unknown
    }
    return pd.DataFrame(data=column_data, columns=column_order)

# Tag every issue in the corpus with the keyword vocabulary and preview the first rows.
df_pred = tag_corpus(corpus, df_vocab)
df_pred.head()




Unnamed: 0,RawText,Items,Problem,Solution,UK_tok
0,No power. Replaced pin in pendant and powered ...,"machine, cable, pendant, pin","short, power",replace,possible
1,Smartscope harness broken. Parts ordered / Tec...,part,broken,"repair, order","harness, tech, smartscope"
2,Check / Charge Accumulators. Where OK,,,"charge, check",accumulators
3,Hyd leak at saw atachment. Replaced seal in sa...,"seal, saw, attachment, hydraulic, saw_attachment",leak,replace,ml
4,CS1008 setup change over / from ARC1004. Compl...,"unit, thread, thread_unit",,"setup, complete, change",rewire


In [9]:
# Save the predicted tags to disk (the file is written to the current working directory,
# not to data_directory).
df_pred.to_excel('keyword_tagged.xlsx')

#### How many have no remaining Unknown Tokens?
i.e. the vocabulary covers every token extracted from the issue, so the mapping from token-space (domain) --> tag-space (codomain) is defined for all of that issue's tokens

In [10]:
# Number of issues whose unknown-token string is empty.
int((df_pred.UK_tok == '').sum())

773

Note that the 3438-773 = 2665 others *apparently* have extra information to be extracted, or at least, *we cannot be certain that there isn't*.

#### How many got NO datafication?
i.e. for how many issues was this process completely worthless? 

In [11]:
# Issues for which the tagger produced no Item, Problem or Solution tag at all.
# Build the row filter once and reuse it for both the count and the table.
untagged_mask = (df_pred[['Items','Problem','Solution']] == '').all(axis=1)
print(untagged_mask.sum())
df_pred[untagged_mask]

8


Unnamed: 0,RawText,Items,Problem,Solution,UK_tok
630,Unload automation not returning.,,,,"unload, return, automation"
1816,.,,,,
2318,Saftey paint required on platform.,,,,"platform, require, paint require, paint"
2467,.,,,,
2571,Camshaft standstill. Gary!!,,,,camshaft
2834,Dead spots on touch screen; Not always functio...,,,,"screen, dead spot, spot, functional, dead, tou..."
3191,Disti water empty. Water filled;,,,,"water, disti, fill"
3405,??.,,,,


### How well did we do at automating the job of Tagging Issues?

In order to measure our success, we need something to compare with. Thankfully, some hard-working engineers have gone through and tagged over 1200 maintenance issues **by hand**. We can use these tags as the "gold standard" tags, with which to compare our automated keyword-->tag mapping. 

In [12]:
# Load the hand-tagged spreadsheet; header=1 plus names= replaces the sheet's header row
# with the column names given here.
df_tag = pd.read_excel(raw_excel_filepath, header=1, encoding=sys.getfilesystemencoding(), 
                   names=['Description','Resolution','p_Item','p_Action','s_Action','s_Item'])
# True for every issue that carries at least one human tag.
mask = df_tag[['p_Item','p_Action','s_Action','s_Item']].notnull().any(axis=1)

from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer


def _tags_per_issue(tag_frame):
    """Return, for each row of `tag_frame`, a flat list of its tags.

    Each unicode cell is split on ', ' and the pieces of all cells in the row are
    concatenated; cells of any other type are skipped.
    """
    # NOTE(review): the exact-type test is deliberate-looking. Besides non-string cells it
    # also appears to drop empty predicted cells (a bare '' from ', '.join([]) is a Py2 str,
    # not unicode) -- confirm before relaxing it to isinstance/basestring.
    return [list(chain(*[cell.split(', ') for cell in row.values if type(cell)==unicode]))
            for _, row in tag_frame.iterrows()]


# Gold-standard tags and keyword-tagger tags, restricted to the same human-tagged issues.
human_tags = _tags_per_issue(df_tag[['p_Item','p_Action','s_Action','s_Item']][mask])
recog_tags = _tags_per_issue(df_pred[['Items','Problem','Solution']][mask])

print(human_tags[0])


# Fit on the union so both tag sets share one label space.
multi_bin =  MultiLabelBinarizer().fit(human_tags + recog_tags)
Y_true = multi_bin.transform(human_tags)
print(Y_true[0].sum())

# Despite its name, Y_train holds the keyword tagger's predictions in binarized form.
Y_train = multi_bin.transform(recog_tags)



[u'pendant_cable', u'cable', u'short', u'no_power', u'replace', u'pin']
6


#### But how will we measure this? 

One way is straight-forward and easy to compute...the accuracy. The **accuracy score** measures, on average, how many predicted outputs match the true outputs perfectly:

$$ S_A = \frac{1}{n}\sum_{i=1}^{n} \mathbb{1}(T_i = P_i)$$

#### A better alternative :

Accuracy is an *overly-harsh metric* for the performance of multilabel classification, since it ignores *how close* we got to the correct output in each case. The **hamming score** is a more forgiving and/or useful metric:

$$ S_H = \frac{1}{n}\sum_{i=1}^{n} \frac{|T_i \cap P_i|}{|T_i \cup P_i|}$$

Note that the closely related **hamming loss** is similar to a distance metric, which unlike the others here is *better when low*.

#### Interpretability, please: 
Finally, we can use the slightly more intuitive **precision**, **recall**, and their harmonic mean **$F_1$-score**. As put by Scikit-Learn: 
> Intuitively, *precision* is the ability of the classifier *not to label as positive a sample that is negative*, and *recall* is the ability of the classifier *to find all the positive samples*.

Then, we can get some sort of combination that balances the two, embodied by $F_1$. Or, put formally:

$$ Pr_i = \frac{|T_i \cap P_i|}{|P_i|}, \qquad Pr = \frac{1}{n}\sum_{i=1}^{n} Pr_i $$
$$ Re_i = \frac{|T_i \cap P_i|}{|T_i|}, \qquad Re = \frac{1}{n}\sum_{i=1}^{n} Re_i $$
$$ F_1 = \frac{1}{n}\sum_{i=1}^{n} \frac{2Pr_i Re_i}{Pr_i+Re_i}$$

If we want to model some difference between our importance of recall vs. precision, the generalized $F$-score is defined as:

$$ F_{\beta} = \frac{1}{n}\sum_{i=1}^{n} (1+\beta^2)\frac{Pr_i Re_i}{\beta^2 Pr_i+Re_i} $$

From  Van Rijsbergen, this is defined so that $F_{\beta}$:
> "measures the effectiveness of retrieval with respect to a user who attaches $\beta$ times as much importance to recall as precision".

In our case, since we do not really trust that the original tags given by humans were all-inclusive (i.e. they might have left out tags below some unknown relevance threshold determined by their attention [read: boredom] level), we want to place more importance on recall in our $F$-measure. We'll use the commonly-applied $F_2$.

In [13]:
from sklearn.metrics import hamming_loss, accuracy_score, precision_recall_fscore_support
from scipy.stats import hmean  # harmonic mean

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Mean per-sample Hamming score (intersection over union of the label sets)
    for the multi-label case.
    http://stackoverflow.com/q/32239577/395857

    y_true and y_pred are indicator matrices with one row per sample. A sample
    where both label sets are empty scores 1. `normalize` and `sample_weight`
    are accepted but not used.
    '''
    per_sample = []
    n_samples = y_true.shape[0]
    for row in range(n_samples):
        truth = set(np.where(y_true[row])[0])
        guess = set(np.where(y_pred[row])[0])
        if not truth and not guess:
            # nothing to find and nothing predicted: count as a perfect match
            per_sample.append(1)
            continue
        shared = truth & guess
        combined = truth | guess
        per_sample.append(len(shared) / float(len(combined)))
    return np.mean(per_sample)

def f_score(y_true, y_pred, beta=1.):
    '''
    Compute the per-sample Precision, Recall, and F-beta Score for the multi-label case
    http://stackoverflow.com/q/32239577/395857

    Parameters
    ----------
    y_true, y_pred : indicator matrices with one row per sample.
    beta : weight of recall relative to precision in the F-measure.

    Returns
    -------
    (precision, recall, f_beta) : three numpy arrays with one entry per sample.
        A sample where both label sets are empty scores 1 on all three; a sample
        with no label in common scores 0 on all three.
    '''
    beta_sq = beta ** 2
    pre_list = []
    rec_list = []
    fsc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        n_common = len(set_true.intersection(set_pred))
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_p, tmp_r, tmp_f = 1, 1, 1
        elif n_common == 0:
            tmp_p, tmp_r, tmp_f = 0, 0, 0
        else:
            # Both sets are non-empty here, so precision and recall are strictly
            # positive and the F-beta denominator cannot be zero.
            tmp_p = n_common / float(len(set_pred))
            tmp_r = n_common / float(len(set_true))
            tmp_f = ((1. + beta_sq) * tmp_p * tmp_r) / (beta_sq * tmp_p + tmp_r)
        pre_list.append(tmp_p)
        rec_list.append(tmp_r)
        fsc_list.append(tmp_f)
    return np.array(pre_list), np.array(rec_list), np.array(fsc_list)

# Score the keyword tagger against the human tags.
# f_score is called with beta=2, so the third figure is an F2 score and is labelled as such.
print('---Automatic Keyword Tagging (TF-IDF+human) ---')
print('Accuracy Score: \t {:.2%}\nHamming Score: \t {:.2%}\nHamming Loss: \t {:.2e}'.format(
    accuracy_score(Y_true, Y_train),
    hamming_score(Y_true, Y_train),
    hamming_loss(Y_true, Y_train)))

print('\nPrecision: \t {:.2%}\nRecall: \t {:.2%}\nF2 Score: \t {:.2%}'.format(
    *[np.mean(i) for i in f_score(Y_true, Y_train, beta=2.)]
))

---Automatic Keyword Tagging (TF-IDF+human) ---
Accuracy Score: 	 2.16%
Hamming Score: 	 39.06%
Hamming Loss: 	 3.82e-03

Precision: 	 46.51%
Recall: 	 68.23%
F1 Score: 	 59.05%


### Is there another way? 

Now, it's important to note that all of the above was done **only** with a list of categorized keywords (i.e. some mapping from token-space to tag-space), and creating that mapping did not at all depend upon some human having tagged *individual issues* already...we were just using those tagged issues as a scoring/benchmark tool. 

If we approach this as a classification problem, assuming these tagged issues *do exist*, we might attempt to train a classifier to predict the set of tags appropriate for given **raw-english** input.

Let's try this: 
- first a mapping from token-space to some useful vector-space (could be tf-idf, maybe a topic model, but here we will use the shiny new **Word2Vec** semantic embedding vectors of our corpus, courtesy of Google+Textacy/Spacy). 
- Then we will train a **classifier** to exactly match the **Multilabel** output, represented by the individual human-tagged issues. 

Support-vector-machines work amazingly well on text embedding classification jobs, so let's use a linear SVC trained using stochastic gradient descent (SGD) via sklearn. We should also minimize overfitting with this high-dimensional job, so we'll regularize with an elasticnet penalty (L1+L2).  

In [16]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

# Document vectors for the human-tagged issues only.
X_train = corpus.vectors[mask]
clf = OneVsRestClassifier(SGDClassifier(#class_weight='balanced',  # compensate for class freqs
                                        penalty='elasticnet',  # L1 + L2 regularized
                                        random_state=0  # SGD shuffles the data; pin the seed so re-runs give the same model
                                        ), n_jobs=3  # 3-cores for the one-vs-all 
                         )
clf.fit(X_train, Y_true)
# Predictions for the very rows the classifier was fitted on, i.e. in-sample output.
Y_train_w2v = clf.predict(X_train)

In [17]:
# Score the classifier's predictions against the human tags.
# f_score is called with beta=2, so the third figure is an F2 score and is labelled as such.
print('---W2V Embeddings with SVC (OneVsRest, SGD)---')
print('Accuracy Score: \t {:.2%}\nHamming Score: \t {:.2%}\nHamming Loss: \t {:.2e}'.format(
    accuracy_score(Y_true, Y_train_w2v),
    hamming_score(Y_true, Y_train_w2v),
    hamming_loss(Y_true, Y_train_w2v)))
print('\nPrecision: \t {:.2%}\nRecall: \t {:.2%}\nF2 Score: \t {:.2%}'.format(
    *[np.mean(i) for i in f_score(Y_true, Y_train_w2v, beta=2.)]
))

---W2V Embeddings with SVC (OneVsRest, SGD)---
Accuracy Score: 	 23.58%
Hamming Score: 	 60.48%
Hamming Loss: 	 1.91e-03

Precision: 	 79.48%
Recall: 	 71.30%
F1 Score: 	 70.12%


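Wow, that's pretty fantastic, considering the task we've set before it! Better performance, at least on a per-metric level, than our automated keyword-tagger in every way! Keep in mind, though, that the classifier is being scored on the very same issues it was trained on, so these numbers are an optimistic estimate of how it would do on unseen issues. 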

Still, note that the precision is almost 80%...this may or may not be a model we want to actually use early on, given that we may or may not trust our engineers' tagging job. Assuming we 100% trust the keyword flagger when it recognizes a word, the precision of that one is actually 100%, and any discrepancy is on the part of *the original tags*. 

Another way to look at it is our keyword tagger is being really nit-picky and overly-accurate, which precision is punishing. Our classifier, on the other hand, is being trained *to tag like the humans*. 

Let's dig in a bit.. To get a more fine-grained idea of what's going on, we can also look at the Pr/Re/F scores on an individual-issue level to get a better idea of the performance of each model. 

In [20]:
# from __future__ import print_function
from ipywidgets import interact, interactive
import ipywidgets as widgets
from IPython.display import display

def compare_by_issue(iss):
    """Print one human-tagged issue next to the tags each approach assigned to it.

    `iss` is a positional index into the human-tagged subset (the rows of Y_true,
    Y_train, Y_train_w2v and df_pred[mask]). For both the keyword tagger and the
    classifier the per-issue precision, recall and F2 score are printed.

    Only the requested row is passed to inverse_transform and f_score, so each call
    costs the same regardless of how many issues there are.
    """
    def _tags_of(label_row):
        # inverse_transform yields one tuple of tag names per input row
        return ', '.join(sorted(multi_bin.inverse_transform(label_row)[0]))

    def _scores_of(pred_row):
        # f_score returns three per-sample arrays; each has a single entry here
        return [per_sample[0] for per_sample in f_score(true_row, pred_row, beta=2.)]

    # [[iss]] keeps a 2-D (1 x n_labels) selection, which is what the helpers expect,
    # and still accepts negative positions.
    true_row = Y_true[[iss]]
    keyword_row = Y_train[[iss]]
    w2v_row = Y_train_w2v[[iss]]
    score_fmt = 'Precision: \t {:.2%}\nRecall: \t {:.2%}\nF2 Score: \t {:.2%}'

    print('Issue No.  {}'.format(iss))
    print(df_pred[mask]['RawText'].iloc[iss])
    print('\nHuman-tagged \"True\" Keyworks/Tags: \t{}'.format(_tags_of(true_row)))

    print('\nTF-IDF rank Human-classified keywords: \t{}'.format(_tags_of(keyword_row)))
    print(score_fmt.format(*_scores_of(keyword_row)))
    print('\nWord2Vec + SVM Multilabel Classifier: \t{}'.format(_tags_of(w2v_row)))
    print(score_fmt.format(*_scores_of(w2v_row)))
# Show the comparison for the first human-tagged issue.
compare_by_issue(0)

Issue No.  0
No power. Replaced pin in pendant and powered machine; Possible short in pendant cable

Human-tagged "True" Keyworks/Tags: 	cable, no_power, pendant_cable, pin, replace, short

TF-IDF rank Human-classified keywords: 	cable, machine, pendant, pin, power, replace, short
Precision: 	 57.14%
Recall: 	 66.67%
F2 Score: 	 64.52%

Word2Vec + SVM Multilabel Classifier: 	cable, error, no_power, pin, replace
Precision: 	 80.00%
Recall: 	 66.67%
F2 Score: 	 68.97%


In [21]:
# Browse the per-issue comparison interactively.
# compare_by_issue indexes the human-tagged subset (Y_true and df_pred[mask]), so the largest
# valid issue number is the last row of Y_true rather than a hard-coded corpus size.
interact(compare_by_issue, iss=widgets.BoundedIntText(
                    value=0,
                    min=0,
                    max=Y_true.shape[0] - 1,
                    description='Pick an Issue No.:',
                    step=1
                    )
               );