In [24]:
import spacy
from spacy.tokens import DocBin

from tqdm import tqdm

import skweak
from skweak import utils

from typing import Iterable, Optional, Sequence, Tuple

from spacy.tokens import Doc, Span  # type: ignore
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

!pip install datasets
!pip install transformers

### Load imdb dataset

In [2]:
from datasets import load_dataset

# Fetch the "imdb" dataset (the recorded output shows it being served from the local cache).
dataset = load_dataset("imdb")

Found cached dataset imdb (/home/chris/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

### Dataset labels

0 -- Negative -- 50%

1 -- Positive -- 50%

-1 -- Unlabelled (the `unsupervised` split)


In [3]:
# Name of the spaCy pipeline. It is also handed to utils.docbin_reader in later
# cells, which needs it for the DocBin -> Doc conversion.
spacy_model_name = "en_core_web_sm"

# English-language spaCy pipeline; spacy_prep uses it to parse the raw text.
nlp = spacy.load(spacy_model_name)

In [4]:
def spacy_prep(data_dict):
    """Parse one dataset record into a spaCy Doc that carries its gold label.

    Args:
        data_dict: Mapping with a "text" entry (the raw text to parse) and a
            "label" entry (the gold label).

    Returns:
        The Doc produced by the module-level ``nlp`` pipeline, with the label
        stored under ``doc.user_data["gold"]``.
    """
    parsed = nlp(data_dict["text"])
    # Keep the gold label on the Doc itself so it travels with the document.
    parsed.user_data.update(gold=data_dict["label"])
    return parsed
    
def write_dataset_to_docbin(dataset_chunk, docbin, docbin_path):
    """Parse every record of a dataset split into ``docbin`` and optionally save it.

    Args:
        dataset_chunk: Iterable of records accepted by ``spacy_prep``.
        docbin: Collector object; every parsed Doc is passed to ``docbin.add``.
        docbin_path: Destination file. When falsy nothing is written to disk and
            the caller simply keeps the populated ``docbin``.
    """
    from pathlib import Path

    if docbin_path:
        # Create the target folder *before* the slow parsing loop: otherwise a
        # missing directory only surfaces at ``to_disk`` time, after all the
        # parsing work has been done, and that work is lost.
        Path(docbin_path).parent.mkdir(parents=True, exist_ok=True)

    for data_dict in tqdm(dataset_chunk):
        docbin.add(spacy_prep(data_dict))

    if docbin_path:
        docbin.to_disk(docbin_path)
        
def write_docs_to_docbin(docs, docbin_path):
    """Serialize already-parsed Docs (including their ``user_data``) to one file.

    Args:
        docs: Iterable of spaCy Docs; it is consumed exactly once.
        docbin_path: Destination file. Missing parent folders are created.
    """
    from pathlib import Path

    # Make sure the destination folder exists before consuming ``docs``: the
    # iterable may be a lazy (and expensive) annotator pipe, and a missing
    # directory would otherwise only fail at the very end.
    Path(docbin_path).parent.mkdir(parents=True, exist_ok=True)

    docbin = DocBin(store_user_data=True)
    for doc in docs:
        docbin.add(doc)
    docbin.to_disk(docbin_path)

In [5]:
# train_doc_bin = DocBin(store_user_data=True)
# unsupervised_doc_bin = DocBin(store_user_data=True)
# test_doc_bin = DocBin(store_user_data=True)


# write_dataset_to_docbin(dataset["train"], train_doc_bin, "sentiment_docbin/train.docbin")

# write_dataset_to_docbin(dataset["unsupervised"], unsupervised_doc_bin, "sentiment_docbin/unsupervised.docbin")

# write_dataset_to_docbin(dataset["test"], test_doc_bin, "sentiment_docbin/test.docbin")

In [6]:
def load_or_build_docs(split, docbin_path):
    """Return the parsed Docs of one dataset split, building the DocBin on first use.

    The cell that originally produced the DocBin files is commented out, so on a
    fresh checkout the files do not exist and a plain read fails. This helper
    parses the split once, stores it at ``docbin_path`` and reuses the file on
    every later run.

    Args:
        split: Key into the module-level ``dataset`` (e.g. "train" or "test").
        docbin_path: Location of the cached DocBin file.

    Returns:
        A list of Docs read back from ``docbin_path``.
    """
    from pathlib import Path

    cache_file = Path(docbin_path)
    if not cache_file.exists():
        # Slow path: parse the whole split with spaCy and persist it.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        write_dataset_to_docbin(dataset[split], DocBin(store_user_data=True), docbin_path)
    return list(utils.docbin_reader(docbin_path, spacy_model_name=spacy_model_name))


train_docs = load_or_build_docs("train", "sentiment_docbin/train.docbin")

test_docs = load_or_build_docs("test", "sentiment_docbin/test.docbin")

In [7]:
def gold_labels(docs):
    """Collect the gold label stored on each document, preserving input order.

    Args:
        docs: Iterable of objects exposing a ``user_data`` mapping with a "gold" key.

    Returns:
        A list with one ``user_data["gold"]`` value per document.
    """
    labels = []
    for doc in docs:
        labels.append(doc.user_data["gold"])
    return labels

# Gold labels in document order; later cells pass them as y_true to calculate_metrics.
train_true = gold_labels(train_docs)
test_true = gold_labels(test_docs)

In [8]:
from typing import Iterable, Optional, Sequence, Tuple

from spacy.tokens import Doc, Span  # type: ignore

class MockPositiveAnnotator(skweak.base.SpanAnnotator):
    """Constant labelling function: marks every document as positive ("1")."""

    def __init__(self):
        # The name is the key under which this annotator's spans are looked up
        # later (doc.spans["mock_positive_annotator"]).
        super().__init__("mock_positive_annotator")

        print("Setup positive mock annotator")

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Yield a single span covering the whole document, labelled "1"."""
        whole_document = (0, len(doc), "1")
        yield whole_document
        
        
class MockNegativeAnnotator(skweak.base.SpanAnnotator):
    """Constant labelling function: marks every document as negative ("0")."""

    def __init__(self):
        # Registered under the name "mock_negative_annotator".
        super().__init__("mock_negative_annotator")

        print("Setup negative mock annotator")

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Yield a single span covering the whole document, labelled "0"."""
        whole_document = (0, len(doc), "0")
        yield whole_document
        
# Instantiate both constant annotators; each constructor prints a setup message.
mock_positive_annotator = MockPositiveAnnotator()
mock_negative_annotator = MockNegativeAnnotator()

Setup positive mock annotator
Setup negative mock annotator


In [9]:
# Majority-vote aggregator over the labels "0"/"1", registered under the name
# "maj_voter" -- the span key that predicted_labels reads by default.
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["0", "1"])

In [10]:
# Pipeline for the mock experiment: the two constant annotators, then the
# majority voter (added last, presumably so it runs after the annotators it
# aggregates -- confirm against CombinedAnnotator).
combi_mock = skweak.base.CombinedAnnotator()
combi_mock.add_annotator(mock_positive_annotator)
combi_mock.add_annotator(mock_negative_annotator)
combi_mock.add_annotator(majority_voter)

<skweak.base.CombinedAnnotator at 0x7faf00645060>

In [11]:
# mock_train_results = list(combi_mock.pipe(train_docs))
# mock_test_results = list(combi_mock.pipe(test_docs))

# write_docs_to_docbin(mock_train_results, "mock_train.docbin")
# write_docs_to_docbin(mock_test_results, "mock_test.docbin")

In [12]:
# Reload the annotated mock documents from DocBin files on disk. The cell that
# writes these files is commented out, so they must already exist.
mock_train_results = list(utils.docbin_reader("mock_train.docbin", spacy_model_name=spacy_model_name))

mock_test_results = list(utils.docbin_reader("mock_test.docbin", spacy_model_name=spacy_model_name))

In [14]:
# Spot check: label of the first span stored under "mock_positive_annotator" on the first training doc.
mock_train_results[0].spans["mock_positive_annotator"][0].label_

'1'

In [15]:
def predicted_labels(docs, agg_name="maj_voter"):
    """Read one integer prediction per document from an aggregator's spans.

    Args:
        docs: Iterable of documents exposing a ``spans`` mapping.
        agg_name: Key in ``doc.spans`` to read from.

    Returns:
        A list holding, for each document, the ``label_`` of the first span
        under ``agg_name`` converted with ``int``.
    """
    predictions = []
    for doc in docs:
        # Only the first span is consulted; its label is a string such as "0" or "1".
        first_span = doc.spans[agg_name][0]
        predictions.append(int(first_span.label_))
    return predictions

In [16]:
# Majority-vote predictions of the mock pipeline for the reloaded training docs.
mock_train_labels = predicted_labels(mock_train_results)

In [17]:
# Majority-vote predictions of the mock pipeline for the reloaded test docs.
mock_test_labels = predicted_labels(mock_test_results)

In [25]:
def calculate_metrics(y_true, y_pred):
    """Print precision, recall, F1 and the confusion matrix for a set of predictions.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None; the report is printed.
    """
    # Compute the scores in the same order in which they appear in the report.
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    report = f"""Precision {precision},
    Recall {recall},
    F1 {f1}
    
    ---
    Confusion matrix
    {matrix}
    """
    print(report)

In [26]:
# Score the mock pipeline's training predictions against the training gold labels.
calculate_metrics(train_true, mock_train_labels)

Precision 0.0,
    Recall 0.0,
    F1 0.0
    
    ---
    Confusion matrix
    [[12500     0]
 [12500     0]]
    


  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Score the mock pipeline on the test set: test gold labels must be paired with
# the *test* predictions (this cell previously passed mock_train_labels).
calculate_metrics(test_true, mock_test_labels)

Precision 0.0,
    Recall 0.0,
    F1 0.0
    
    ---
    Confusion matrix
    [[12500     0]
 [12500     0]]
    


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Sanity check on a toy example: when no sample is predicted positive
# (y_pred is all zeros), precision_score returns 0.0 and emits a warning.
from sklearn.metrics import precision_score
y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0]
precision_score(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [29]:
from skweak import gazetteers

# Every gazetteer entry is wrapped in a one-element list before being handed to
# the Trie (presumably one list element per token -- confirm against skweak).
NEGATIVE_WORDS = [[word] for word in ("unhappy", "sad", "disappointing", "bad")]
POSITIVE_WORDS = [[word] for word in ("happy", "great", "awesome", "amazing", "good", "fun")]

# NOTE(review): NEGATIVE_WORDS is built but never turned into a trie, so the
# gazetteer below can only ever produce the label 1. Confirm whether a negative
# trie was meant to be registered as well.
positive_trie = gazetteers.Trie(POSITIVE_WORDS)

# Matches of positive_trie are reported with the integer label 1.
gazetteer = gazetteers.GazetteerAnnotator("sent_gazetteer", {1: positive_trie})

In [30]:
class GazetteerDocDetector(skweak.base.SpanAnnotator):
    """Document-level sentiment vote derived from the module-level ``gazetteer``."""

    def __init__(self):
        super().__init__("gazetteer_doc_detector")

    def find_spans(self, doc):
        """Yield one span covering the whole document, labelled "1" or "0".

        The third element of every tuple produced by ``gazetteer.find_spans``
        is added up; a strictly positive total gives "1", anything else "0".
        """
        total = sum(label for _start, _end, label in gazetteer.find_spans(doc))
        # TODO set default to -1 (unlabelled) or 0 (negative)?
        verdict = "1" if total > 0 else "0"
        yield 0, len(doc), verdict  # type: ignore
        
# Instance that is added to the combined sentiment pipeline in a later cell.
gazetteer_doc_detector = GazetteerDocDetector()

In [31]:
# Fresh voter for the gazetteer pipeline. This rebinds the name majority_voter,
# which was previously used for the mock pipeline's voter.
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["0", "1"])

In [32]:
# Pipeline for the gazetteer experiment: the document-level detector followed
# by the majority voter.
combined_sentiment = skweak.base.CombinedAnnotator()
combined_sentiment.add_annotator(gazetteer_doc_detector)

combined_sentiment.add_annotator(majority_voter)

<skweak.base.CombinedAnnotator at 0x7faedf6ddff0>

In [None]:
# gaz_train_docs = list(combined_sentiment.pipe(train_docs))
# write_docs_to_docbin(gaz_train_docs, "trained_v1.docbin")

In [33]:
# Run the gazetteer pipeline over the test docs and persist the annotated result.
gaz_docs_test = list(combined_sentiment.pipe(test_docs))
write_docs_to_docbin(gaz_docs_test, "gaz_docs_test.docbin")

In [34]:
# Reload the annotated docs from disk. "trained_v1.docbin" is written only by a
# commented-out cell, so it must already exist from an earlier run.
gaz_docs_train = list(utils.docbin_reader("trained_v1.docbin", spacy_model_name=spacy_model_name))

gaz_docs_test = list(utils.docbin_reader("gaz_docs_test.docbin", spacy_model_name=spacy_model_name))

In [35]:
# Majority-vote predictions of the gazetteer pipeline for both splits.
gaz_docs_train_labels = predicted_labels(gaz_docs_train)
gaz_docs_test_labels = predicted_labels(gaz_docs_test)

In [36]:
# Score the gazetteer pipeline's training predictions against the training gold labels.
calculate_metrics(train_true, gaz_docs_train_labels)

Precision 0.5524544179523142,
    Recall 0.63024,
    F1 0.5887892376681615
    
    ---
    Confusion matrix
    [[6118 6382]
 [4622 7878]]
    


In [37]:
# Score the gazetteer pipeline on the test set: test predictions must be
# compared with the *test* gold labels (this cell previously passed train_true).
calculate_metrics(test_true, gaz_docs_test_labels)

Precision 0.5522292083452415,
    Recall 0.61832,
    F1 0.5834088164251208
    
    ---
    Confusion matrix
    [[6233 6267]
 [4771 7729]]
    
