In [15]:
import spacy
from spacy.tokens import DocBin

from tqdm import tqdm

import skweak
from skweak import utils

from typing import Iterable, Optional, Sequence, Tuple

from spacy.tokens import Doc, Span  # type: ignore
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix



!pip install datasets
!pip install transformers

### Load imdb dataset

In [16]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb"; its "train" and "test"
# splits are the ones consumed further down in this notebook.
dataset = load_dataset("imdb")

Found cached dataset imdb (/Users/chris/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
100%|██████████| 3/3 [00:00<00:00, 560.44it/s]


### Dataset labels

0 -- Negative -- 50%

1 -- Positive -- 50%

-1 -- Unlabelled


In [None]:
# One-time setup (run in a shell if the model is not installed yet): python -m spacy download en_core_web_sm



In [17]:
# Name of the spaCy pipeline. It is kept in a variable because the same name is
# passed to utils.docbin_reader later, when DocBin files are read back into Docs.
spacy_model_name = "en_core_web_sm" # needed for DocBin -> Doc process

# Shared pipeline object used by spacy_prep to parse every review.
nlp = spacy.load(spacy_model_name)   # We load an English-language model



In [14]:
# Smoke test: run the loaded pipeline on a short sentence and display the text
# held by the resulting Doc.
doc = nlp("This is a test.")

doc.text



'This is a test.'

In [18]:
def spacy_prep(data_dict):
    """Parse one dataset record into a spaCy Doc that carries its gold label.

    Args:
        data_dict: Mapping with a "text" entry (passed to the module-level
            ``nlp`` pipeline) and a "label" entry.

    Returns:
        The Doc produced by ``nlp``, with ``data_dict["label"]`` stored under
        ``user_data["gold"]``.
    """
    parsed = nlp(data_dict["text"])
    # The label travels with the Doc via user_data so it can be read back later.
    parsed.user_data["gold"] = data_dict["label"]
    return parsed
    
def write_dataset_to_docbin(dataset_chunk, docbin, docbin_path):
    """Parse every record of ``dataset_chunk`` into ``docbin`` and optionally save it.

    Args:
        dataset_chunk: Iterable of records accepted by ``spacy_prep``.
        docbin: Container whose ``add`` method receives each parsed Doc.
        docbin_path: Destination handed to ``docbin.to_disk``; when falsy,
            nothing is written to disk.
    """
    records = tqdm(dataset_chunk)  # progress bar over the records
    for record in records:
        parsed = spacy_prep(record)
        docbin.add(parsed)

    # Guard clause: skip serialisation when no destination was given.
    if not docbin_path:
        return
    docbin.to_disk(docbin_path)
        
def write_docs_to_docbin(docs, docbin_path):
    """Serialise ``docs`` (including their ``user_data``) to ``docbin_path``.

    Args:
        docs: Iterable of spaCy Docs to store.
        docbin_path: Destination handed to ``DocBin.to_disk``.
    """
    # DocBin adds every Doc passed through ``docs`` at construction time.
    DocBin(store_user_data=True, docs=docs).to_disk(docbin_path)
    
    

In [21]:
from pathlib import Path

# Create the output directory for the serialised DocBins. exist_ok=True keeps
# the cell re-runnable: a plain `mkdir` fails once the directory exists.
Path("sentiment_docbin").mkdir(exist_ok=True)

In [22]:
# Parse the whole training split and write it to disk. store_user_data=True is
# what keeps the gold label (set by spacy_prep) in the serialised file.
# Slow step: the recorded run took roughly 17 minutes.
train_doc_bin = DocBin(store_user_data=True)
write_dataset_to_docbin(dataset["train"], train_doc_bin, "sentiment_docbin/train.docbin")


100%|██████████| 25000/25000 [17:07<00:00, 24.33it/s]


In [24]:
# Same as for the training split: parse the test split and write it to disk
# with the gold labels preserved in user_data. The recorded run also took
# roughly 17 minutes.
test_doc_bin = DocBin(store_user_data=True)
write_dataset_to_docbin(dataset["test"], test_doc_bin, "sentiment_docbin/test.docbin")

100%|██████████| 25000/25000 [17:07<00:00, 24.34it/s]


In [25]:
# Read both serialised splits back into memory as lists of Docs.
train_docs = list(
    utils.docbin_reader("sentiment_docbin/train.docbin", spacy_model_name=spacy_model_name)
)
test_docs = list(
    utils.docbin_reader("sentiment_docbin/test.docbin", spacy_model_name=spacy_model_name)
)

# Report the number of docs per split, one per line (train first).
print(len(train_docs), len(test_docs), sep="\n")



25000
25000


In [26]:
def gold_labels(docs):
    """Return the value stored under ``user_data["gold"]`` for each doc, in order.

    Args:
        docs: Iterable of objects exposing a ``user_data`` mapping with a
            "gold" entry.

    Returns:
        A list with one gold label per doc.
    """
    labels = []
    for doc in docs:
        labels.append(doc.user_data["gold"])
    return labels

# Gold labels of both splits, in the same order as the loaded docs; used as
# y_true in the metric cells below.
train_true = gold_labels(train_docs)
test_true = gold_labels(test_docs)

In [27]:
from typing import Iterable, Optional, Sequence, Tuple

from spacy.tokens import Doc, Span  # type: ignore

class MockPositiveAnnotator(skweak.base.SpanAnnotator):
    """Mock annotator that labels every document as positive ("1")."""

    def __init__(self):
        """Register the annotator as "mock_positive_annotator" and print a setup message."""
        super().__init__("mock_positive_annotator")
        print("Setup positive mock annotator")

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Yield a single (start, end, label) triple spanning all of ``doc``."""
        whole_doc = (0, len(doc), "1")
        yield whole_doc
        
        
class MockNegativeAnnotator(skweak.base.SpanAnnotator):
    """Mock annotator that labels every document as negative ("0")."""

    def __init__(self):
        """Register the annotator as "mock_negative_annotator" and print a setup message."""
        super().__init__("mock_negative_annotator")
        print("Setup negative mock annotator")

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Yield a single (start, end, label) triple spanning all of ``doc``."""
        whole_doc = (0, len(doc), "0")
        yield whole_doc
        
# Instantiate both mock annotators; each constructor prints a setup message.
mock_positive_annotator = MockPositiveAnnotator()
mock_negative_annotator = MockNegativeAnnotator()

Setup positive mock annotator
Setup negative mock annotator


In [28]:
# Aggregator over the labels "0" and "1". Its name, "maj_voter", is the key
# under which predicted_labels later looks up the aggregated span.
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["0", "1"])

In [29]:
# Bundle both mock annotators and the voter into one pipeline object.
# The voter is added last — presumably so that it runs after the annotators
# whose spans it aggregates (confirm against the skweak documentation).
combi_mock = skweak.base.CombinedAnnotator()
combi_mock.add_annotator(mock_positive_annotator)
combi_mock.add_annotator(mock_negative_annotator)
combi_mock.add_annotator(majority_voter)

<skweak.base.CombinedAnnotator at 0x373d15880>

In [31]:
# Run the mock pipeline over each split. The test results must come from
# test_docs (not train_docs) so the two result sets really are different data.
mock_train_results = list(combi_mock.pipe(train_docs))
mock_test_results = list(combi_mock.pipe(test_docs))

# Persist each result set under its own file name.
write_docs_to_docbin(mock_train_results, "mock_train.docbin")
write_docs_to_docbin(mock_test_results, "mock_test.docbin")

In [32]:
# Reload the annotated docs from disk, replacing the in-memory lists produced
# by the previous cell.
mock_train_results = list(utils.docbin_reader("mock_train.docbin", spacy_model_name=spacy_model_name))

mock_test_results = list(utils.docbin_reader("mock_test.docbin", spacy_model_name=spacy_model_name))

In [33]:
# Inspect the label of the first span that the positive mock annotator attached
# to the first reloaded training doc.
mock_train_results[0].spans["mock_positive_annotator"][0].label_

'1'

In [34]:
def predicted_labels(docs, agg_name="maj_voter", default=-1):
    """Return one integer label per doc, read from the aggregated span group.

    Args:
        docs: Iterable of docs whose ``spans`` mapping may contain ``agg_name``.
        agg_name: Key of the span group to read (the aggregator's name).
        default: Label returned for a doc that has no span under ``agg_name``.
            Defaults to -1, the value this notebook uses for "unlabelled".

    Returns:
        A list of ints: the label of the first span under ``agg_name``
        converted with ``int``, or ``default`` when that group is missing
        or empty.
    """
    labels = []
    for doc in docs:
        # A doc the aggregator produced nothing for must not crash the whole
        # extraction; it is reported as ``default`` instead.
        spans = doc.spans[agg_name] if agg_name in doc.spans else ()
        labels.append(int(spans[0].label_) if len(spans) else default)
    return labels

In [35]:
# Aggregated ("maj_voter") labels for the mock-annotated training docs.
mock_train_labels = predicted_labels(mock_train_results)

In [36]:
# Aggregated ("maj_voter") labels for the mock-annotated test docs.
mock_test_labels = predicted_labels(mock_test_results)

In [61]:
def calculate_metrics(y_true, y_pred):
    """Print macro-averaged precision, recall and F1 plus the confusion matrix.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None; the report is written to stdout.
    """
    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")
    f1 = f1_score(y_true, y_pred, average="macro")
    matrix = confusion_matrix(y_true, y_pred)

    # The layout (including the four-space indents) is kept exactly as is so
    # the printed report stays unchanged.
    report = (
        f"Precision {precision},\n"
        f"    Recall {recall},\n"
        f"    F1 {f1}\n"
        "    \n"
        "    ---\n"
        "    Confusion matrix\n"
        f"    {matrix}\n"
        "    "
    )
    print(report)

In [38]:
# Score the mock pipeline on the training split (gold vs. aggregated labels).
calculate_metrics(train_true, mock_train_labels)

Precision 0.0,
    Recall 0.0,
    F1 0.0
    
    ---
    Confusion matrix
    [[12500     0]
 [12500     0]]
    


  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Score the mock pipeline on the test split: gold test labels must be compared
# with the test predictions, not with the training predictions.
calculate_metrics(test_true, mock_test_labels)

Precision 0.0,
    Recall 0.0,
    F1 0.0
    
    ---
    Confusion matrix
    [[12500     0]
 [12500     0]]
    


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# Toy check of precision_score when only class 0 is ever predicted: the
# recorded result is 0.0, accompanied by a scikit-learn warning.
from sklearn.metrics import precision_score
y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0]
precision_score(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [69]:
from skweak import gazetteers

# Seed lexicons. Every word is wrapped in its own one-element list, which is
# the form handed to gazetteers.Trie below.
NEGATIVE_WORDS = [[word] for word in ("unhappy", "sad", "disappointing", "bad")]
POSITIVE_WORDS = [[word] for word in ("happy", "great", "awesome", "amazing", "good", "fun")]

positive_trie = gazetteers.Trie(POSITIVE_WORDS)
negative_trie = gazetteers.Trie(NEGATIVE_WORDS)

# The trie keys are the integers 1 and -1 so that GazetteerDocDetector can add
# them up into a per-document score.
gazetteer = gazetteers.GazetteerAnnotator(
    "sent_gazetteer",
    {1: positive_trie, -1: negative_trie},
)

In [70]:
class GazetteerDocDetector(skweak.base.SpanAnnotator):
    """Document-level sentiment vote built on the module-level ``gazetteer``.

    The third element of every triple yielded by ``gazetteer.find_spans`` is
    summed; the sign of the total selects the label of a single span covering
    the whole doc: "1" (total > 0), "0" (total < 0) or "-1" (total == 0).
    """

    def __init__(self):
        """Register the annotator under the name "gazetteer_doc_detector"."""
        super().__init__("gazetteer_doc_detector")

    def find_spans(self, doc):
        """Yield one (0, len(doc), label) triple for ``doc``."""
        total = sum(score for _start, _end, score in gazetteer.find_spans(doc))

        if total > 0:
            doc_label = "1"
        elif total < 0:
            doc_label = "0"
        else:
            # TODO set default to -1 (unlabelled) or 0 (negative)?
            doc_label = "-1"

        yield 0, len(doc), doc_label
        
# Single detector instance that is plugged into the combined pipeline below.
gazetteer_doc_detector = GazetteerDocDetector()

In [71]:
# Rebinds majority_voter (first created for the mock pipeline) to a new voter
# whose label set also contains "-1", the label GazetteerDocDetector emits
# when the gazetteer score is zero. The aggregator name stays "maj_voter".
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["0", "1", "-1"])

In [72]:
# Pipeline for the gazetteer experiment: the document-level detector followed
# by the voter that writes the "maj_voter" spans read by predicted_labels.
combined_sentiment = skweak.base.CombinedAnnotator()
combined_sentiment.add_annotator(gazetteer_doc_detector)

combined_sentiment.add_annotator(majority_voter)

<skweak.base.CombinedAnnotator at 0x2c32684c0>

In [73]:
# Annotate the training docs with the gazetteer pipeline and persist the result.
gaz_train_docs = list(combined_sentiment.pipe(train_docs))
write_docs_to_docbin(gaz_train_docs, "trained_v1.docbin")

In [74]:
# Annotate the test docs with the gazetteer pipeline and persist the result.
gaz_docs_test = list(combined_sentiment.pipe(test_docs))
write_docs_to_docbin(gaz_docs_test, "gaz_docs_test.docbin")

In [75]:
# Reload both annotated splits from disk. Note the naming: the reloaded
# training docs are gaz_docs_train, while the in-memory list is gaz_train_docs.
gaz_docs_train = list(utils.docbin_reader("trained_v1.docbin", spacy_model_name=spacy_model_name))

gaz_docs_test = list(utils.docbin_reader("gaz_docs_test.docbin", spacy_model_name=spacy_model_name))

In [76]:
# Aggregated ("maj_voter") labels of the reloaded, gazetteer-annotated docs.
gaz_docs_train_labels = predicted_labels(gaz_docs_train)
gaz_docs_test_labels = predicted_labels(gaz_docs_test)

In [81]:
from collections import Counter

# Distribution of the predicted labels on the training split.
C = Counter(gaz_docs_train_labels)

C

Counter({1: 11826, 0: 13174})

In [82]:
# Distribution of the predicted labels on the test split (reuses the name C).
C = Counter(gaz_docs_test_labels)

C

Counter({1: 11621, -1: 10188, 0: 3191})

In [83]:
# Score the gazetteer pipeline on the training split.
calculate_metrics(train_true, gaz_docs_train_labels)

Precision 0.6079939780632233,
    Recall 0.60768,
    F1 0.6073946380905813
    
    ---
    Confusion matrix
    [[7933 4567]
 [5241 7259]]
    


In [84]:
# Score the gazetteer pipeline on the test split.
# NOTE(review): the recorded output shows a 3x3 confusion matrix, so the
# unlabelled (-1) predictions appear to be counted as a class of their own in
# the macro averages — confirm whether they should be filtered out first.
calculate_metrics(test_true, gaz_docs_test_labels)

Precision 0.4779863082816183,
    Recall 0.2602933333333333,
    F1 0.3085785817867689
    
    ---
    Confusion matrix
    [[   0    0    0]
 [5414 2613 4473]
 [4774  578 7148]]
    


  _warn_prf(average, modifier, msg_start, len(result))


In [89]:
import pandas as pd

# Texts and weak labels are both derived from the reloaded annotated docs
# (gaz_docs_train / gaz_docs_test), so row i always pairs a review with the
# label predicted for that same review.
train_texts = [doc.text for doc in gaz_docs_train]
train_weak_labels = gaz_docs_train_labels

test_texts = [doc.text for doc in gaz_docs_test]
test_weak_labels = gaz_docs_test_labels  # the labels, not the Doc objects

train_df = pd.DataFrame(list(zip(train_texts, train_weak_labels)),
               columns =['text', 'label'])

# Drop reviews left unlabelled (-1). The mask comes from train_df itself, so
# the cell no longer depends on test_df existing before it is defined.
train_df = train_df[train_df.label != -1]


train_df.to_json("weak_imdb_train.jsonl", orient="records", lines=True)

# The test frame is built from the test texts and labels, not the train ones.
test_df = pd.DataFrame(list(zip(test_texts, test_weak_labels)),
               columns =['text', 'label'])

test_df = test_df[test_df.label != -1]

test_df.to_json("weak_imdb_test.jsonl", orient="records", lines=True)


0        True
1        True
2        True
3        True
4        True
         ... 
24995    True
24996    True
24997    True
24998    True
24999    True
Name: label, Length: 25000, dtype: bool

In [None]:
# Shell command (not executed from this notebook): python -m spacy init fill-config ./base_config.cfg ./config.cfg