In [1]:
%load_ext autoreload
%autoreload 2
import os
import os.path
import numpy as np
import sklearn
import sklearn.model_selection
import sklearn.linear_model
import sklearn.ensemble
import spacy
import sys
from sklearn.feature_extraction.text import CountVectorizer
from anchor import anchor_text
import time

---
### Load data

In [2]:
def load_polarity(path='../data/rt-polaritydata'):
    """Load the sentence-polarity corpus stored under ``path``.

    Reads ``rt-polarity.neg`` and ``rt-polarity.pos`` line by line in binary
    mode. A line that is not valid UTF-8 is skipped entirely.

    Args:
        path: Directory that contains the two ``rt-polarity.*`` files.

    Returns:
        A ``(data, labels)`` tuple of parallel lists. ``data`` holds every
        kept line as whitespace-stripped ``bytes``; ``labels`` holds the
        matching integer class (0 for the ``.neg`` file, 1 for ``.pos``).

    Raises:
        OSError: If one of the two files cannot be opened.
    """
    data = []
    labels = []
    f_names = ['rt-polarity.neg', 'rt-polarity.pos']
    for label, f_name in enumerate(f_names):
        # The context manager guarantees the handle is closed, even if
        # reading fails part-way through the file.
        with open(os.path.join(path, f_name), 'rb') as f:
            for line in f:
                try:
                    # Decoding is only a validity check; the raw bytes are kept.
                    line.decode('utf8')
                except UnicodeDecodeError:
                    continue
                data.append(line.strip())
                labels.append(label)
    return data, labels

In [3]:
# Load the spaCy pipeline 'en_core_web_lg'; this object is passed to AnchorText
# when the explainer is built.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Read the corpus from load_polarity's default directory.
data, labels = load_polarity()

In [8]:
# First hold out 20% of the corpus as the test split, then take 10% of the
# remaining training portion as a validation split. Both calls use
# random_state=42.
train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
    data, labels, test_size=.2, random_state=42
)
train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
    train, train_labels, test_size=.1, random_state=42
)

In [17]:
# Peek at the first four training sentences.
train[:4]

[b'rare birds has more than enough charm to make it memorable .',
 b'mr . parker has brilliantly updated his source and grasped its essence , composing a sorrowful and hilarious tone poem about alienated labor , or an absurdist workplace sitcom .',
 b"shadyac shoots his film like an m . night shyamalan movie , and he frequently maintains the same snail's pace ; he just forgot to add any genuine tension .",
 b'diaz wears out her welcome in her most charmless performance']

In [18]:
# Labels of the first four training sentences (0 = negative file, 1 = positive file).
train_labels[:4]

[1, 1, 0, 0]

Convert the label lists to NumPy arrays.

In [19]:
# Convert the three label lists to NumPy arrays in a single unpacking step.
train_labels, test_labels, val_labels = (
    np.array(split_labels)
    for split_labels in (train_labels, test_labels, val_labels)
)

In [22]:
# First four training labels again, now displayed as a NumPy array.
train_labels[:4]

array([1, 1, 0, 0])

---
### Text preprocessing

In [23]:
# Fit the vectorizer on the training sentences only; fit() returns the
# estimator itself, so the fitted object is bound directly.
vectorizer = CountVectorizer(min_df=1).fit(train)
# Display the fitted estimator as the cell output.
vectorizer

CountVectorizer()

In [24]:
# Encode the training sentences with the fitted vectorizer.
train_vectors = vectorizer.transform(train)

In [25]:
# Encode the test sentences with the vectorizer that was fitted on the training split.
test_vectors = vectorizer.transform(test)

In [26]:
# Encode the validation sentences with the vectorizer that was fitted on the training split.
val_vectors = vectorizer.transform(val)

---

### Training

In [37]:
# Train a logistic-regression classifier on the encoded training split,
# allowing up to 1000 solver iterations. fit() returns the estimator itself.
model = sklearn.linear_model.LogisticRegression(max_iter=1000).fit(
    train_vectors, train_labels
)
# Display the fitted estimator as the cell output.
model

LogisticRegression(max_iter=1000)

In [38]:
# Predict labels for the validation split.
preds = model.predict(val_vectors)

In [39]:
# Report the fraction of validation sentences whose predicted label matches the true one.
# NOTE(review): sklearn.metrics is never imported explicitly in the import cell;
# this line relies on it being loaded as a side effect of the other sklearn
# imports. Consider adding `import sklearn.metrics` to the import cell.
print('Val accuracy', sklearn.metrics.accuracy_score(val_labels, preds))

Val accuracy 0.7544910179640718


In [40]:
def predict_text(text):
    """Classify raw text with the notebook's fitted vectorizer and model.

    Args:
        text: Iterable of documents, as accepted by ``vectorizer.transform``
            (the notebook calls it with a list of strings).

    Returns:
        Whatever ``model.predict`` returns for the encoded documents.
    """
    features = vectorizer.transform(text)
    return model.predict(features)

---

### Explanation

In [43]:
# Build the Anchor text explainer from the spaCy pipeline and the class names
# (index 0 = 'negative', index 1 = 'positive', matching the labels from load_polarity).
# NOTE(review): use_unk_distribution=True presumably makes the perturbations
# replace words with an UNK token — confirm against the anchor documentation.
explainer = anchor_text.AnchorText(nlp, ['negative', 'positive'], use_unk_distribution=True)

In [57]:
# Seed NumPy's global RNG (presumably so the explanation run below is
# repeatable), then classify one example sentence and map the predicted
# label index to its class name.
np.random.seed(1)
text = 'This is a good book .'
pred = explainer.class_names[predict_text([text])[0]]

In [58]:
# Predicted class name for the example sentence.
pred

'positive'

In [59]:
# Explain the prediction for `text`; predict_text is the classifier function
# handed to the explainer.
# NOTE(review): threshold=0.95 is presumably the minimum precision required of
# the anchor — confirm against the anchor documentation.
exp = explainer.explain_instance(text, predict_text, threshold=0.95, verbose=False, use_proba=True)

In [60]:
# Bare repr of the explanation object.
exp

<anchor.anchor_explanation.AnchorExplanation at 0x7f0dc343aaf0>

In [61]:
# Names of the conditions that make up the anchor.
exp.names()

['good']

In [62]:
# The anchor as a single rule string, with its conditions joined by ' AND '.
' AND '.join(exp.names())

'good'

In [63]:
# Precision that the explanation reports for this anchor.
exp.precision()

1.0