In [None]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline

# Preparing data

In [None]:
# Restrict the 20-newsgroups corpus to the two classes studied in this notebook.
categories = ['alt.atheism', 'soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# Short display names, listed in the same order as `categories`.
class_names = ['atheism', 'christian']
# Case-sensitive TF-IDF features (lowercase=False), fitted on the training split only.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)
# Convert the sparse matrices to dense arrays; the dense form is what the models below receive.
train_vectors = train_vectors.toarray()
test_vectors = test_vectors.toarray()
# Tokenizer callable built from the vectorizer's own configuration.
tokenizer = vectorizer.build_tokenizer()

In [None]:
# Vocabulary terms indexed by feature column.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    names_features = np.array(vectorizer.get_feature_names_out(), dtype=str)
else:
    names_features = np.array(vectorizer.get_feature_names(), dtype=str)

In [None]:
# Number of TF-IDF features. The fitted `vocabulary_` mapping is used because
# `get_feature_names()` was removed in scikit-learn 1.2.
len(vectorizer.vocabulary_)

# Training the NLS

In [None]:
# Grid-style search space with 'model__'-prefixed keys. No other visible cell
# of this notebook reads `parameters`.
parameters = {
        'model__es_give_up_after_nepochs': [20]
        , 'model__hidden_size': [100, 250, 500]
        , 'model__num_layers': [1, 3, 5]
    }

# Keyword-argument sets that the training loops unpack into the model constructor.
comb_parameters = [{
        'es_give_up_after_nepochs': 20
        , 'hidden_size': 100
        , 'num_layers': 2
        , 'n_classification_labels': 2,
    }
        ]

In [None]:

# Train one NLS model per configuration in `comb_parameters`; after the loop
# `model` is the model built from the last configuration.
# NOTE(review): `NLS` is not imported in any visible cell of this notebook —
# confirm where it comes from so the notebook runs on a fresh kernel.
for parameter in comb_parameters:
    model = NLS(
        verbose=0
        , es=True
        , gpu=True
        , scale_data=False
        , varying_theta0=False
        , fixed_theta0=True
        , dataloader_workers=0
        # , with_mean=False
        , **parameter
    ) 
    model.fit(x_train=train_vectors, y_train=newsgroups_train.target)

## Testing model

In [None]:
# Evaluate the trained model on the held-out test split.
pred = model.predict(test_vectors)
print('F1 Score:', sklearn.metrics.f1_score(newsgroups_test.target, pred, average='binary'))
print('Accuracy', sklearn.metrics.accuracy_score(newsgroups_test.target, pred, normalize=True, sample_weight=None))

# Explaining a Document

In [None]:
# Index (into the test split) of the document explained throughout the notebook.
idx = 83
# newsgroups_test.data[idx]
print('Document id: %d' % idx)
# print(model.predict(test_vectors))
# NOTE(review): the label says "Probability(christian)" but the whole
# predict_proba output for the document is printed, not a single class.
print('Probability(christian) =', model.predict_proba([test_vectors[idx]]))

print('True class: %s' % class_names[newsgroups_test.target[idx]])
print('True class number:',newsgroups_test.target[idx] )
print('Text:')
print(newsgroups_test.data[idx])
# Single-row feature matrix and raw text of the chosen document, reused by the
# explanation cells below.
x_explain = test_vectors[idx].reshape(1, -1)
document_explain = newsgroups_test.data[idx]

In [None]:
# for i in range(100):
#     idx = i
# #     # newsgroups_test.data[idx]
# #     print('Document id: %d' % idx)
# #     # print(model.predict(test_vectors))
# #     print('Probability(christian) =', model.predict_proba([test_vectors[idx]]))

# #     print('True class: %s' % class_names[newsgroups_test.target[idx]])
# #     print('True class number:',newsgroups_test.target[idx] )
# #     print('Text:')
#     print('*****************************')
#     print('*****************************')
#     print(newsgroups_test.data[idx])
#     x_explain = test_vectors[idx].reshape(1, -1)
#     document_explain = newsgroups_test.data[idx]

# Generating an Explanation

In [None]:
# Explainer bound to the model trained on the raw (uncleaned) documents.
exp_NLS = ExplainText(model, class_names=['atheism', 'christian'], names_features=names_features)
# dict_exp = exp.get_text_explanation(x_explain, document=document_explain)

In [None]:
# Explanation dictionary for the chosen document, limited to the 10 top-ranked words.
dict_exp = exp_NLS.get_text_explanation(x_explain, document=document_explain, num_features=10)

In [None]:
def plot_betas(dict_exp, class_names=('Atheism', 'Christian')):
    """Plot, per word, the coefficients of the predicted and the runner-up class.

    :param dict_exp: explanation dictionary as returned by
        ``ExplainText.get_text_explanation``. The keys ``'words'``,
        ``'betas_document'`` and ``'betas_document_neg'`` are required;
        ``'ind_class_sorted'`` is used for the legend when present.
    :param class_names: display names indexed by class number.
    :return: None; the figure is drawn on a new matplotlib figure.

    ``betas_document`` holds the coefficients of the *predicted* class and
    ``betas_document_neg`` those of the second class, so the legend labels are
    looked up through ``ind_class_sorted`` instead of being hard-coded.
    Without that key the first two entries of ``class_names`` are used in order.
    """
    words = dict_exp['words'][::-1]
    ranking = dict_exp.get('ind_class_sorted')
    if ranking is None:
        label_predicted, label_second = class_names[0], class_names[1]
    else:
        label_predicted = class_names[ranking[0]]
        label_second = class_names[ranking[1]]

    fig, ax = plt.subplots()
    width = 0.35
    ind = np.arange(len(words))
    # Two bars per word, drawn side by side around the tick position.
    ax.barh(ind - width / 2, dict_exp['betas_document'][::-1], width, label=label_predicted)
    ax.barh(ind + width / 2, dict_exp['betas_document_neg'][::-1], width, label=label_second)
    ax.set_title('Atheism Christian Class Feature Importance')
    ax.set_yticks(ind)
    ax.set_yticklabels(words)
    ax.grid(True, axis='y')
    ax.legend()

In [None]:
# Per-word coefficient bars for the explanation computed above.
plot_betas(dict_exp)

In [None]:
# Three-panel figure (probabilities, feature importance, document) for the 8 top-ranked words.
plot_exp = exp_NLS.explain_graphical(x_explain, document=document_explain, num_features=8)


In [None]:
# The selected words next to their column indices in the feature matrix.
names_features[dict_exp['indices_words']], dict_exp['indices_words']


In [None]:
# No tokenizer is passed here, so ExplainText.document_html takes its
# `tokenizer is None` branch and this name is bound to None.
document_html = exp_NLS.document_html(x_explain, document=document_explain, num_features=20)

In [None]:
# Raw text of the explained document, as stored in the explanation dictionary.
document = dict_exp['document']

In [None]:
import re
# Demo: replace every occurrence of a prohibited substring in a message.
# The pattern has no word boundaries, so 'on' also matches inside 'contains'.
prohibitedWords = ['on', 'Random', 'Words']
big_regex = re.compile('|'.join(map(re.escape, prohibitedWords)))
the_message = big_regex.sub("<replaced>", 'this message contains Some really Random Words')
the_message

In [None]:
def replace(matched):
    """Replacement callback for ``re.sub``.

    :param matched: the match object handed over by ``re.sub``; it is ignored.
    :return: the fixed marker string that takes the place of every match.
    """
    return "REPLACEMENT"

In [None]:
# Whole-word, case-sensitive substitution of the entries of `words`; the
# capitalised 'A' at the start of the sentence is therefore left untouched.
words = ['a', 'an']
text = 'A word here, an b a'
pattern = r"\b(%s)\b" % "|".join(words)
text = re.sub(pattern, replace, text)
text

In [None]:
def document_html(x_explain, document, num_features=10, tokenizer=None):
    """Return the document as HTML with the most important words highlighted.

    :param x_explain: vectorized document (single-row feature matrix);
    :param document: the same document in text format;
    :param num_features: number of top-ranked words to highlight;
    :param tokenizer: callable splitting ``document`` into tokens. Only words
        that the tokenizer actually produces for this document are highlighted.
    :return: a ``TextHTML`` wrapping the markup, or ``None`` when no tokenizer
        is given.

    Each word is matched as a whole word. (Joining the characters of a word
    with ``|`` — the previous behaviour — matched single letters instead.)
    All words are substituted in a single pass so that the inserted markup
    itself is never re-matched, and the document text is HTML-escaped so that
    characters such as ``<`` in the source text are shown rather than
    interpreted as tags.
    """
    import html  # local import: not provided by the notebook's import cell

    if tokenizer is None:
        return None
    exp = exp_NLS.get_text_explanation(x_explain, document, num_features=num_features)
    document_tokens = set(tokenizer(document))
    # Longest first, so the alternation prefers the longest candidate at a position.
    to_mark = sorted(
        {str(word) for word in exp['words'] if str(word) in document_tokens},
        key=lambda word: (-len(word), word),
    )
    if not to_mark:
        return TextHTML(html.escape(document))

    format_i = '<b style="background-color:Tomato;">'
    format_e = '</b>'
    pattern = re.compile(r"\b(%s)\b" % "|".join(re.escape(word) for word in to_mark))
    # With one capturing group, re.split alternates plain text (even positions)
    # and matched words (odd positions).
    pieces = pattern.split(document)
    marked = [
        format_i + html.escape(piece) + format_e if position % 2 else html.escape(piece)
        for position, piece in enumerate(pieces)
    ]
    return TextHTML(''.join(marked))

In [None]:
class TextHTML:
    """Wrap an HTML string so that Jupyter shows it as rich output."""

    def __init__(self, html):
        # The markup is stored verbatim; nothing is escaped or validated here.
        self.html = html

    def _repr_html_(self):
        """Return the stored markup (the hook Jupyter's rich display looks up)."""
        return self.html

In [None]:
def _segment_with_tokens(text, tokens):
    """Segment a string around the tokens created by a passed-in tokenizer"""
    list_form = []
    text_ptr = 0
    for token in tokens:
        inter_token_string = []
        while not text[text_ptr:].startswith(token):
            inter_token_string.append(text[text_ptr])
            text_ptr += 1
            if text_ptr >= len(text):
                raise ValueError("Tokenization produced tokens that do not belong in string!")
        text_ptr += len(token)
        if inter_token_string:
            list_form.append(''.join(inter_token_string))
        list_form.append(token)
    if text_ptr < len(text):
        list_form.append(text[text_ptr:])
    return list_form

In [None]:
# Highlighted version of the explained document, built with the notebook-level
# document_html function and the vectorizer's tokenizer.
a = document_html(x_explain, document, num_features=10, tokenizer=tokenizer)

In [None]:
# Display the highlighted document.
a

# Explaining Lime

In [None]:
# Pipeline step that converts sparse input to a dense array.
# NOTE(review): SparseMatrix is defined in a later cell of this notebook, so
# that cell has to be executed before this one.
sparse_matrix = SparseMatrix()

In [None]:
# Raw text -> TF-IDF -> dense array -> NLS model; lets LIME call predict_proba on strings.
c_1 = make_pipeline(vectorizer, sparse_matrix, model)

In [None]:
from lime.lime_text import LimeTextExplainer
# LIME text explainer with a fixed seed.
explainer = LimeTextExplainer(class_names=class_names, random_state=65464)

In [None]:
# LIME explanation of the same test document, using the raw-data pipeline.
exp = explainer.explain_instance(newsgroups_test.data[idx], c_1.predict_proba, num_features=10)

In [None]:
# LIME explanation in list form.
exp.as_list()

In [None]:
# Render the LIME explanation inline, including the document text.
exp.show_in_notebook(text=True)

In [None]:
# NLS explanation for the same document and the same number of words, for comparison with LIME.
plot_exp = exp_NLS.explain_graphical(x_explain, document=document_explain, num_features=10)

# Explanation from the Data

In [None]:
# y_train_atheism = newsgroups_train.target[indices_atheism]
# Training features of the raw-data section as a NumPy array.
x_train = np.array(train_vectors)

In [None]:
# For every word present in the explained document, sum its TF-IDF weight over
# the training documents whose target is 0 (atheism).
indices_atheism = np.where(newsgroups_train.target == 0)[0]
x_train_atheism = x_train[indices_atheism]
important_words = x_train_atheism[:, dict_exp['indices_all_words']]
sum_atheism = np.sum(important_words, axis=0)

In [None]:
# Same sum over the training documents whose target is 1 (christian).
indices_christian = np.where(newsgroups_train.target == 1)[0]
x_train_christian = x_train[indices_christian]
important_words_christian = x_train_christian[:, dict_exp['indices_all_words']]
sum_christian = np.sum(important_words_christian, axis=0)

In [None]:
# Overlay the per-class TF-IDF sums for each word of the explained document;
# the atheism bars are semi-transparent so both series stay visible.
fig, ax = plt.subplots(figsize=(20, 10), dpi=200)
ax.barh(names_features[dict_exp['indices_all_words']], sum_christian, label='christian')
ax.barh(names_features[dict_exp['indices_all_words']], sum_atheism, label='atheism', alpha=0.5)
leg = plt.legend()


In [None]:
# Per-word difference between the two class sums: green bars lean towards
# atheism, red bars towards christian.
fig, ax = plt.subplots(figsize=(10, 15))
y_p = np.arange(len(names_features[dict_exp['indices_all_words']]))
vals = sum_atheism-sum_christian
bar_colors = ['green' if v > 0 else 'red' for v in vals]
ax.barh(y_p, vals, color=bar_colors)
ax.set_yticks(y_p)
ax.set_yticklabels(names_features[dict_exp['indices_all_words']])
# Guide lines in alternating colours make it easier to follow a label to its bar.
# The x-limits are captured first and restored afterwards so that the lines
# span the axes without rescaling them.
x_lim = ax.get_xlim()
colors = ['tab:orange', 'tab:blue']
for i, y_p_i in enumerate(y_p):
    i_c = i%2
    ax.plot(x_lim, [y_p_i]*2, c=colors[i_c])
ax.set_xlim(x_lim)
title = ax.set_title('Atheism - Christian (frequency words)')

# Removing: headers, footers, quotes

## Preparing data

In [None]:
# Same two classes as before, but with headers, footers and quoted replies
# stripped from every post.
categories = ['alt.atheism', 'soc.religion.christian']
newsgroups_train_clean = fetch_20newsgroups(subset='train'
                                      , remove=('headers', 'footers', 'quotes')
                                      , categories=categories)
newsgroups_test_clean = fetch_20newsgroups(subset='test'
                                     , remove=('headers', 'footers', 'quotes')
                                     , categories=categories)
class_names = ['atheism', 'christian']
# Unlike the raw-data section, this vectorizer lowercases the text.
vectorizer_clean = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=True)  # ngram_range=(1, 2)
train_vectors_clean = vectorizer_clean.fit_transform(newsgroups_train_clean.data)
test_vectors_clean = vectorizer_clean.transform(newsgroups_test_clean.data)
# Convert the sparse matrices to dense arrays, as in the raw-data section.
train_vectors_clean = train_vectors_clean.toarray()
test_vectors_clean = test_vectors_clean.toarray()
tokenizer_clean = vectorizer_clean.build_tokenizer()

In [None]:
# Vocabulary terms of the cleaned corpus, indexed by feature column.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer_clean, 'get_feature_names_out'):
    names_features_clean = np.array(vectorizer_clean.get_feature_names_out(), dtype=str)
else:
    names_features_clean = np.array(vectorizer_clean.get_feature_names(), dtype=str)

In [None]:
# Number of TF-IDF features of the cleaned corpus. The fitted `vocabulary_`
# mapping is used because `get_feature_names()` was removed in scikit-learn 1.2.
len(vectorizer_clean.vocabulary_)

## Training the NLS

In [None]:
# Model configuration for the cleaned-data section (same values as in the raw-data section).
comb_parameters = [{
        'es_give_up_after_nepochs': 20
        , 'hidden_size': 100
        , 'num_layers': 2
        , 'n_classification_labels': 2,
    }
        ]

In [None]:
# Train one NLS model per configuration on the cleaned features; after the
# loop `model_clean` is the model built from the last configuration.
# NOTE(review): `NLS` is not imported in any visible cell of this notebook.
for parameter in comb_parameters:
    model_clean = NLS(
        verbose=0
        , es=True
        , gpu=True
        , scale_data=False
        , varying_theta0=False
        , fixed_theta0=True
        , dataloader_workers=0
        # , with_mean=False
        , **parameter
    ) 
    model_clean.fit(x_train=train_vectors_clean, y_train=newsgroups_train_clean.target)

## Testing model

In [None]:
# Evaluate the cleaned-data model on the cleaned test split.
pred = model_clean.predict(test_vectors_clean)
# NOTE(review): `pred_poba` is not read by any other visible cell.
pred_poba = model_clean.predict_proba(test_vectors_clean)
print('Score:', sklearn.metrics.f1_score(newsgroups_test_clean.target, pred, average='binary'))

# Generating an Explanation

In [None]:
# changing (work in progress)

In [None]:
# Explainer bound to the model trained on the cleaned documents.
exp_NLS_clean = ExplainText(model_clean
                             , class_names=['atheism', 'christian']
                             , names_features=names_features_clean
                            )

In [None]:
# Instance to explain in the cleaned-data section. These two names were used
# without being defined in any visible cell; they are built here the same way
# as `x_explain` / `document_explain` in the raw-data section, for the same
# document index `idx`.
x_explain_clean = test_vectors_clean[idx].reshape(1, -1)
document_explain_clean = newsgroups_test_clean.data[idx]

# Explanation dictionary limited to the 20 top-ranked words.
dict_exp_clean = exp_NLS_clean.get_text_explanation(x_explain_clean
                                              , document=document_explain_clean
                                              , num_features=20)

In [None]:
# Same extraction that ExplainText.get_text_explanation performs: third element
# of the get_thetas result, first (only) explained row.
betas = model_clean.get_thetas(x_pred=x_explain_clean, net_scale=True)[2][0]

In [None]:
# Inspect the feature row being explained.
x_explain_clean

In [None]:
# Column indices of the features with a positive value in the explained row.
words_from_text_indices = np.argwhere(x_explain_clean[0] > 0).reshape(-1)

In [None]:
# Vocabulary terms for those columns.
names_doc = names_features_clean[words_from_text_indices]

In [None]:
# Coefficient rows restricted to those columns.
betas_r = betas[words_from_text_indices]

In [None]:
# Absolute difference between the two class coefficients of each word
# (despite the name, this is a difference, not a sum).
betas_sum = np.abs(betas_r[:, 0] - betas_r[:, 1])

In [None]:
# Word positions ordered from largest to smallest absolute difference.
indices = np.argsort(betas_sum)[::-1]
# names_doc[indices]

In [None]:
# Per-word coefficient bars for the cleaned-data explanation.
plot_betas(dict_exp_clean)

In [None]:
# Three-panel explanation figure for the cleaned document, 20 top-ranked words.
plot_exp = exp_NLS_clean.explain_graphical(x_explain_clean
                                           , document=document_explain_clean
                                           , num_features=20)


In [None]:
# The selected words next to their column indices in the cleaned feature matrix.
names_features_clean[dict_exp_clean['indices_words']], dict_exp_clean['indices_words']


# Explaining Lime

In [None]:
# Fresh sparse-to-dense pipeline step (rebinds the earlier `sparse_matrix`).
sparse_matrix = SparseMatrix()

In [None]:
# Raw text -> cleaned-corpus TF-IDF -> dense array -> cleaned-data NLS model.
c_clean = make_pipeline(vectorizer_clean, sparse_matrix, model_clean)

In [None]:
from lime.lime_text import LimeTextExplainer
# LIME text explainer for the cleaned-data section, same fixed seed as before.
explainer_clean = LimeTextExplainer(class_names=class_names, random_state=65464)

In [None]:
# LIME explanation of the cleaned document. It has to query the cleaned-data
# pipeline (`c_clean`); `c_1` wraps the vectorizer and model of the raw-data
# section and would explain a different classifier.
exp_clean = explainer_clean.explain_instance(newsgroups_test_clean.data[idx]
                                             , c_clean.predict_proba
                                             , num_features=10)

In [None]:
# LIME explanation of the cleaned document in list form.
exp_clean.as_list()

In [None]:
# Render the LIME explanation inline, including the document text.
exp_clean.show_in_notebook(text=True)

# Training a RF

In [None]:
import sklearn.ensemble

In [None]:
# Random-forest baseline on the cleaned features. The seed is fixed (same
# value as the LIME explainers above) so that re-running the notebook
# reproduces the same forest and the same scores.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, random_state=65464)
rf.fit(train_vectors_clean, newsgroups_train_clean.target)

## Testing RF Model

In [None]:
# F1 score of the random forest on the cleaned test split.
pred = rf.predict(test_vectors_clean)
sklearn.metrics.f1_score(newsgroups_test_clean.target, pred, average='binary')

In [None]:
# Random-forest prediction for the explained document.
rf.predict(x_explain_clean)

## Training a NN model

In [None]:
# Train an NNPredict model per configuration. This uses the raw-data features
# (`train_vectors`), not the cleaned ones.
# NOTE(review): `NNPredict` is not imported in any visible cell of this notebook.
for parameter in comb_parameters:
    model_1 = NNPredict(
        verbose=1
        , es=True
        , gpu=True
        , dataloader_workers=0
        , **parameter
    )
    model_1.fit(x_train=train_vectors, y_train=newsgroups_train.target)

In [None]:
# F1 score of the NNPredict model on the raw test split.
pred = model_1.predict(test_vectors)
print('Score:', sklearn.metrics.f1_score(newsgroups_test.target, pred, average='binary'))

In [None]:
# Raw text -> raw-corpus TF-IDF -> dense array -> NNPredict model.
c = make_pipeline(vectorizer, sparse_matrix, model_1) 

In [None]:
# Class probabilities of the NNPredict pipeline for the explained document.
c.predict_proba([newsgroups_test.data[idx]])

In [None]:
from lime.lime_text import LimeTextExplainer
# LIME explainer for the NNPredict pipeline. The seed matches the other two
# explainers in this notebook so the sampled perturbations are reproducible.
explainer = LimeTextExplainer(class_names=class_names, random_state=65464)
class_names

In [None]:
# LIME explanation of the raw test document for the NNPredict pipeline (rebinds `exp`).
exp = explainer.explain_instance(newsgroups_test.data[idx], c.predict_proba, num_features=10)

In [None]:
#exp.as_list()

In [None]:
# Render the LIME explanation inline, without the document text.
exp.show_in_notebook(text=False)

In [None]:
import numpy as np
from scipy.sparse import issparse

from matplotlib import pyplot as plt


def label_bar(rects, ax, labels=None, offset_y=0.4):
    """Annotate horizontal bars with their value and, optionally, a text label.

    :param rects: sequence of bar patches (as returned by ``ax.barh``); each
        must provide ``get_x``, ``get_y``, ``get_width`` and ``get_height``;
    :param ax: axes the annotations are added to;
    :param labels: optional sequence with one label per bar. When omitted only
        the bar width is written, in a large font. When given, the width is
        written in a small font and the label is placed at the base of the
        bar, on the side opposite to the direction the bar grows in;
    :param offset_y: vertical shift, in data units, applied to the value text
        when ``labels`` is omitted.
    :return: None.
    """
    for i, rect in enumerate(rects):
        width = rect.get_width()
        text_width = '{:3.2f}'.format(width)

        if labels is None:
            ax.annotate(
                text_width,
                xy=(width / 2, rect.get_y() - offset_y + rect.get_height() / 2),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha='center',
                va='bottom',
                size=30,
            )
            continue

        # Value text, centred on the middle of the bar. Only `ha` is passed:
        # the previous call also passed the alias `horizontalalignment` with
        # a conflicting value.
        ax.annotate(
            text_width,
            xy=(width / 2, (rect.get_y() + rect.get_height() / 2) - 0.25),
            xytext=(0, -1),  # 1 point downwards
            textcoords="offset points",
            ha='center',
            va='bottom',
            size=13,
            color='black',
        )

        # Put the label on the far side of the zero line so that it does not
        # overlap the bar it belongs to.
        if width > 0:
            align_text = 'right'
            offset_x = -3
        else:
            align_text = 'left'
            offset_x = +3

        ax.annotate(
            labels[i],
            xy=(rect.get_x(), rect.get_y()),
            xytext=(offset_x, 0),  # 3 points horizontal offset
            textcoords="offset points",
            ha=align_text,
            va='bottom',
            size=14,
        )
        
     

    
def simpleaxis(ax):
    """Hide the four border lines (spines) of ``ax``.

    :param ax: axes whose ``spines`` mapping has the keys 'top', 'right',
        'bottom' and 'left'.
    :return: None.
    """
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)


class SparseMatrix(object):
    """
    Transformation to be used in a sklearn pipeline:
    converts a sparse matrix to a dense array and passes dense input through.
    # TODO: The NLS, LLS, NNPredict should accept sparse array
    """
    def __init__(self):
        pass

    def fit(self, X=None, y=None, **fit_params):
        """
        Nothing to learn. The data arguments are accepted, and ignored, so
        that the step can be fitted as part of a pipeline, which passes them.
        :return: the transformer itself.
        """
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """
        Equivalent to ``fit(X, y).transform(X)``.
        :param X: sparse or dense input;
        :return: dense version of ``X``.
        """
        return self.transform(X)

    @staticmethod
    def transform(x):
        """
        :param x: sparse or dense input;
        :return: ``x.toarray()`` when ``x`` is sparse, otherwise ``x`` itself.
        """
        if issparse(x):
            return x.toarray()
        return x


class ExplainText(object):
    """Word-level explanations of the predictions of an NLS text classifier."""

    def __init__(self, model, class_names, names_features):
        """
        :param model: NLS model;
        :param class_names: class names to be utilized in the plot;
        :param names_features: names of the features.
        """
        self.model = model
        self.class_names = np.array(class_names)
        self.names_features = np.array(names_features)

    def get_text_explanation(self, x_explain, document, num_features=10):
        """
        Get the explanation of text document.
        :param x_explain: document to be explained, should be vectorized;
        :param document: document in text format;
        :param num_features: number of features to produce the explanation.
        :return: dictionary with the ranked words, their betas (predicted
        class, second class and the difference between both), the model
        prediction and probabilities, the class indices sorted by decreasing
        probability, the document, and the feature indices of the ranked
        words and of all words present in the document.
        """
        explanation = self.model.get_thetas(x_pred=x_explain, net_scale=True)
        betas = explanation[2][0]
        # Only the words that actually occur in the document are considered.
        words_from_text_indices = np.argwhere(x_explain[0] != 0).reshape(-1)

        # Prediction from the model
        prediction = self.model.predict(x_explain).reshape(-1)
        predict_proba = self.model.predict_proba(x_explain).reshape(-1)
        ind_pred_proba = np.argsort(predict_proba)[::-1]

        # Get the col_number of the predict and the second classes.
        col_betas = ind_pred_proba[0]
        col_betas_neg = ind_pred_proba[1]

        # Get the betas for the predict and second (neg) classes.
        betas_document = betas[words_from_text_indices, col_betas]
        betas_document_neg = betas[words_from_text_indices, col_betas_neg]

        # Positive values favour the predicted class, negative ones the second class.
        betas_final = betas_document - betas_document_neg
        words_features_document = self.names_features[words_from_text_indices].reshape(-1)

        # Ranking the betas by absolute value - descending order.
        beta_0_abs = np.abs(betas_final)
        betas_rank_ind = np.argsort(beta_0_abs)[::-1]
        # Selecting the first num_features important features.
        betas_rank_ind = betas_rank_ind[:num_features]

        # Getting the important words.
        words_features_document_rank = words_features_document[betas_rank_ind]

        return dict(betas=betas_final[betas_rank_ind]
                    , betas_document=betas_document[betas_rank_ind]
                    , betas_document_neg=betas_document_neg[betas_rank_ind]
                    , words=words_features_document_rank
                    , prediction=prediction
                    , prediction_proba=predict_proba
                    , ind_class_sorted=ind_pred_proba
                    , document=document
                    , indices_words=words_from_text_indices[betas_rank_ind]
                    , indices_all_words=words_from_text_indices
                    )

    def document_html(self, x_explain, document, num_features=10, tokenizer=None):
        """
        Get the document as HTML with the most important words highlighted.
        :param x_explain: document to be explained, should be vectorized;
        :param document: document in text format;
        :param num_features: number of top-ranked words to highlight;
        :param tokenizer: callable splitting the document into tokens; only
        words that it produces for this document are highlighted.
        :return: HTML string (the document text is HTML-escaped and every
        highlighted word is wrapped in a <b> tag), or None when no tokenizer
        is given.
        """
        # Local imports: the cell defining this class does not import them.
        import html
        import re

        if tokenizer is None:
            return None
        exp = self.get_text_explanation(x_explain, document, num_features=num_features)
        document_tokens = set(tokenizer(document))
        # Longest first, so the alternation prefers the longest candidate at a position.
        to_mark = sorted(
            {str(word) for word in exp['words']} & document_tokens,
            key=lambda word: (-len(word), word),
        )
        if not to_mark:
            return html.escape(document)

        open_tag = '<b style="background-color:Tomato;">'
        close_tag = '</b>'
        pattern = re.compile(r"\b(%s)\b" % "|".join(map(re.escape, to_mark)))
        # With one capturing group, re.split alternates plain text (even
        # positions) and matched words (odd positions).
        pieces = pattern.split(document)
        return ''.join(
            open_tag + html.escape(piece) + close_tag if position % 2 else html.escape(piece)
            for position, piece in enumerate(pieces)
        )

    def explain_graphical(self, x_explain, document, num_features=10):
        """
        Plot the explanation as a three-panel figure: predicted
        probabilities, feature importance and the document text.
        :param x_explain: document to be explained, should be vectorized;
        :param document: document in text format;
        :param num_features: number of features to produce the explanation.
        :return: the figure and the array with its three axes.
        """
        exp = self.get_text_explanation(x_explain, document, num_features=num_features)
        fig, axs = plt.subplots(1, 3, figsize=(12, 5), dpi=200)

        # Panel 0: predicted probabilities.
        # NOTE(review): here the colours follow the class index (first class
        # green, second red), whereas in the feature-importance panel green
        # stands for the predicted class — confirm whether this is intended.
        colors = ['green', 'red']
        rects1 = axs[0].barh(self.class_names[::-1], exp['prediction_proba'][::-1], color=colors[::-1])
        axs[0].set_ylim([-3, 2])
        simpleaxis(axs[0])
        axs[0].set_xticks([])
        axs[0].tick_params('y', labelsize=20)
        axs[0].set_title('Predicted Probabilities', size=20)
        label_bar(rects1, axs[0])

        # Panel 1: feature importance, most important word at the top.
        names = exp['words'][::-1]
        vals = exp['betas'][::-1]
        class_names_sorted = self.class_names[exp['ind_class_sorted']]
        self.get_plot_feature_importance(axs[1], names, vals, class_names_sorted)

        # Panel 2: the document itself.
        simpleaxis(axs[2])
        axs[2].set_xticks([])
        axs[2].set_yticks([])
        axs[2].text(0, 1, '\n' + exp['document'], style='italic', wrap=True, va='top')
        axs[2].set_title('Document to Explain', size=20)
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=None)
        return fig, axs

    def get_plot_feature_importance(self, ax, names, vals, class_names):
        """
        Draw the feature-importance panel.
        :param ax: axes to draw on;
        :param names: words, one per bar;
        :param vals: importance of each word; positive values are drawn in
        green and the remaining ones in red;
        :param class_names: class names; the first is written in green on the
        positive side and the second in red on the negative side.
        """
        colors = ['green' if x > 0 else 'red' for x in vals]
        pos = np.arange(len(vals))
        rects2 = ax.barh(pos, vals, align='center', color=colors)
        ax.set_yticks([])
        ax.set_yticklabels([])
        ax.set_xticks([])
        ax.set_xticklabels([])
        label_bar(rects2, ax, labels=names)
        ax.axvline(0, color='black', lw=2)
        ax.set_title('Features Importance', size=20)
        # Extra room above the bars for the two class names.
        y_lim = np.array(ax.get_ylim())
        ax.set_ylim(y_lim+np.array([0, 1.6]))

        ax.annotate('  ' + class_names[0], xy=(0,y_lim[1]), size=22, color='green', ha='left')
        ax.annotate(class_names[1] + '  ', xy=(0,y_lim[1]), size=22, color='red', ha='right')
        simpleaxis(ax)

In [None]:
# Rebuild the raw-data explainer with the ExplainText class defined in the cell above.
exp_NLS = ExplainText(model, class_names=['atheism', 'christian'], names_features=names_features)
# dict_exp = exp.get_text_explanation(x_explain, document=document_explain)

In [None]:
# Explanation dictionary for the chosen document, limited to the 10 top-ranked words.
dict_exp = exp_NLS.get_text_explanation(x_explain, document=document_explain, num_features=10)

In [None]:
# Three-panel explanation figure for the chosen document, 10 top-ranked words.
plot_exp = exp_NLS.explain_graphical(x_explain, document=document_explain, num_features=10)
