In [11]:
from typing import Iterable, Optional, Sequence, Tuple

import pandas as pd
import skweak
import spacy
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from skweak import utils
from spacy.tokens import Doc, Span  # type: ignore
from spacy.tokens import DocBin
from tqdm import tqdm

!pip install datasets
!pip install transformers

### Load the IMDb dataset

In [1]:
from datasets import load_dataset

# Load the "imdb" dataset; the cells below index it by split name
# ("train", "test", "unsupervised").
dataset = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset imdb (/Users/chris/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
100%|██████████| 3/3 [00:00<00:00, 527.03it/s]


In [8]:
# Experiment: try to overwrite a label in place. The read-back in the next cell
# still shows 0, so this assignment does not persist in the dataset
# (presumably indexing hands back a fresh dict each time — confirm).
dataset["train"][0]["label"] = 1

In [9]:
# Read the label back to see whether the assignment in the previous cell stuck.
dataset["train"][0]["label"]

0

### Dataset labels

0 -- Negative -- 50%

1 -- Positive -- 50%

-1 -- Unlabelled


In [None]:
# python -m spacy download en_core_web_sm

In [12]:
# The model name is kept in a variable because it is also passed to
# utils.docbin_reader when the cached DocBin files are read back.
spacy_model_name = "en_core_web_sm" # needed for DocBin -> Doc process

nlp = spacy.load(spacy_model_name)   # We load an English-language model

In [14]:
# Smoke test: run the pipeline on a short sentence and echo its text.
doc = nlp("This is a test.")

doc.text

'This is a test.'

In [4]:
def spacy_prep(data_dict):
    """Turn one dataset record into a spaCy Doc that carries its gold label.

    The record's "text" is run through the module-level ``nlp`` pipeline and
    its "label" is stored under ``doc.user_data["gold"]``.
    """
    parsed = nlp(data_dict["text"])
    parsed.user_data["gold"] = data_dict["label"]
    return parsed
    
def write_dataset_to_docbin(dataset_chunk, docbin, docbin_path):
    """Parse every record of ``dataset_chunk`` and collect the Docs in ``docbin``.

    Each record goes through ``spacy_prep`` (with a tqdm progress bar) and is
    added to the given ``docbin``. When ``docbin_path`` is truthy the filled
    DocBin is then written to that path; otherwise nothing is written.
    """
    for record in tqdm(dataset_chunk):
        parsed = spacy_prep(record)
        docbin.add(parsed)

    # A falsy path means "fill the DocBin only, do not persist it".
    if not docbin_path:
        return
    docbin.to_disk(docbin_path)
        
def write_docs_to_docbin(docs, docbin_path):
    """Serialise ``docs`` into a new DocBin and write it to ``docbin_path``.

    ``store_user_data=True`` is set so that each Doc's ``user_data`` (where
    ``spacy_prep`` puts the gold label) is stored alongside the Doc.
    """
    # DocBin adds every Doc passed via ``docs`` at construction time.
    bundle = DocBin(store_user_data=True, docs=docs)
    bundle.to_disk(docbin_path)

In [5]:
# train_doc_bin = DocBin(store_user_data=True)
# unsupervised_doc_bin = DocBin(store_user_data=True)
# test_doc_bin = DocBin(store_user_data=True)


# write_dataset_to_docbin(dataset["train"], train_doc_bin, "sentiment_docbin/train.docbin")

# write_dataset_to_docbin(dataset["unsupervised"], unsupervised_doc_bin, "sentiment_docbin/unsupervised.docbin")

# write_dataset_to_docbin(dataset["test"], test_doc_bin, "sentiment_docbin/test.docbin")

In [6]:
# Read the pre-parsed Docs back from disk. The DocBin files are produced by the
# (commented-out) write_dataset_to_docbin calls in the previous cell.
train_docs = list(utils.docbin_reader("sentiment_docbin/train.docbin", spacy_model_name=spacy_model_name))

test_docs = list(utils.docbin_reader("sentiment_docbin/test.docbin", spacy_model_name=spacy_model_name))

In [7]:
def gold_labels(docs):
    """Return the ``user_data["gold"]`` value of every doc, in input order."""
    labels = []
    for doc in docs:
        labels.append(doc.user_data["gold"])
    return labels

# Gold labels, aligned index-by-index with train_docs / test_docs.
train_true = gold_labels(train_docs)
test_true = gold_labels(test_docs)

In [8]:
from typing import Iterable, Optional, Sequence, Tuple

from spacy.tokens import Doc, Span  # type: ignore

class MockPositiveAnnotator(skweak.base.SpanAnnotator):
    """Mock labelling function that marks the whole document as positive ("1")."""

    def __init__(self):
        # The annotator name doubles as the span-layer key read later on,
        # e.g. doc.spans["mock_positive_annotator"].
        super().__init__("mock_positive_annotator")
        print("Setup positive mock annotator")

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Yield a single (start, end, label) triple spanning all tokens of ``doc``."""
        whole_document = (0, len(doc), "1")
        yield whole_document
        
        
class MockNegativeAnnotator(skweak.base.SpanAnnotator):
    """Mock labelling function that marks the whole document as negative ("0")."""

    def __init__(self):
        # The name passed to the base class identifies this annotator's output.
        super().__init__("mock_negative_annotator")
        print("Setup negative mock annotator")

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Yield a single (start, end, label) triple spanning all tokens of ``doc``."""
        whole_document = (0, len(doc), "0")
        yield whole_document
        
# One instance of each mock annotator; each constructor prints a setup message.
mock_positive_annotator = MockPositiveAnnotator()
mock_negative_annotator = MockNegativeAnnotator()

Setup positive mock annotator
Setup negative mock annotator


In [9]:
# Majority-vote aggregator over the labels "0" and "1". Its name, "maj_voter",
# is the span layer that predicted_labels reads by default.
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["0", "1"])

In [10]:
# Pipeline for the mock experiment: both constant annotators first, then the
# majority voter that aggregates their output.
combi_mock = skweak.base.CombinedAnnotator()
combi_mock.add_annotator(mock_positive_annotator)
combi_mock.add_annotator(mock_negative_annotator)
combi_mock.add_annotator(majority_voter)

<skweak.base.CombinedAnnotator at 0x7faf00645060>

In [11]:
# mock_train_results = list(combi_mock.pipe(train_docs))
# mock_test_results = list(combi_mock.pipe(test_docs))

# write_docs_to_docbin(mock_train_results, "mock_train.docbin")
# write_docs_to_docbin(mock_test_results, "mock_test.docbin")

In [12]:
# Load the cached output of the mock pipeline instead of re-running it.
mock_train_results = list(utils.docbin_reader("mock_train.docbin", spacy_model_name=spacy_model_name))

mock_test_results = list(utils.docbin_reader("mock_test.docbin", spacy_model_name=spacy_model_name))

In [14]:
# Sanity check: label of the first span the positive mock annotator produced
# for the first training doc.
mock_train_results[0].spans["mock_positive_annotator"][0].label_

'1'

In [15]:
def predicted_labels(docs, agg_name="maj_voter"):
    """Return one integer label per doc, taken from the ``agg_name`` span layer.

    For every doc the first span of ``doc.spans[agg_name]`` is read and its
    string ``label_`` is converted with ``int``. A doc without that layer, or
    with an empty layer, raises (KeyError / IndexError).
    """
    labels = []
    for doc in docs:
        first_span = doc.spans[agg_name][0]
        labels.append(int(first_span.label_))
    return labels

In [3]:
# Majority-vote label for each training doc of the mock pipeline.
mock_train_labels = predicted_labels(mock_train_results)

NameError: name 'predicted_labels' is not defined

In [None]:
# Majority-vote label for each test doc of the mock pipeline.
mock_test_labels = predicted_labels(mock_test_results)

In [None]:
def calculate_metrics(y_true, y_pred):
    """Print precision, recall, F1 and the confusion matrix of ``y_pred`` vs ``y_true``.

    All four values come from the scikit-learn helpers of the same name,
    called with their default arguments. Nothing is returned.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    report = f"""Precision {precision},
    Recall {recall},
    F1 {f1}
    
    ---
    Confusion matrix
    {matrix}
    """
    print(report)

In [None]:
# Mock pipeline scored against the gold labels of the training split.
calculate_metrics(train_true, mock_train_labels)

In [None]:
# Mock pipeline scored against the gold labels of the test split. The
# predictions must come from the test docs as well (mock_test_labels), not
# from the training docs.
calculate_metrics(test_true, mock_test_labels)

In [None]:
# Toy example: y_pred contains no positive (1) prediction at all, so precision
# has no predicted positives to divide by.
# NOTE(review): scikit-learn is expected to warn and return 0.0 here — confirm.
from sklearn.metrics import precision_score
y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0]
precision_score(y_true, y_pred)

In [None]:
from skweak import gazetteers

# Every word is wrapped in a one-element list before it is handed to Trie
# (presumably because entries are token sequences — confirm against skweak docs).
# NOTE: NEGATIVE_WORDS is built but not passed to the gazetteer below; only the
# positive words are matched.
NEGATIVE_WORDS = [[word] for word in ("unhappy", "sad", "disappointing", "bad")]
POSITIVE_WORDS = [
    [word] for word in ("happy", "great", "awesome", "amazing", "good", "fun")
]

positive_trie = gazetteers.Trie(POSITIVE_WORDS)

# The trie is registered under the integer key 1; GazetteerDocDetector sums the
# third element of every match, so the key is kept numeric.
gazetteer = gazetteers.GazetteerAnnotator("sent_gazetteer", {1: positive_trie})

In [None]:
class GazetteerDocDetector(skweak.base.SpanAnnotator):
    """Document-level sentiment annotator driven by the module-level ``gazetteer``.

    The whole document is labelled "1" when the summed gazetteer score is
    above zero and "0" otherwise.
    """

    def __init__(self):
        super().__init__("gazetteer_doc_detector")

    def find_spans(self, doc):
        """Yield one (0, len(doc), label) triple for ``doc``."""
        # Add up the third element of every gazetteer match. With the gazetteer
        # configured under the key 1 this presumably counts the positive-word
        # matches — confirm what find_spans yields as its third element.
        gazetteer_score = sum(
            score for _start, _end, score in gazetteer.find_spans(doc)
        )
        # TODO set default to -1 (unlabelled) or 0 (negative)?
        label = "1" if gazetteer_score > 0 else "0"
        yield 0, len(doc), label
        
# Single detector instance that is added to the sentiment pipeline below.
gazetteer_doc_detector = GazetteerDocDetector()

In [None]:
# Fresh majority voter for the gazetteer pipeline, created with the same
# arguments as the one used in the mock experiment.
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["0", "1"])

In [None]:
# Sentiment pipeline: the gazetteer-based detector followed by the majority
# voter (which here aggregates a single annotator).
combined_sentiment = skweak.base.CombinedAnnotator()
combined_sentiment.add_annotator(gazetteer_doc_detector)

combined_sentiment.add_annotator(majority_voter)

In [None]:
# gaz_train_docs = list(combined_sentiment.pipe(train_docs))
# write_docs_to_docbin(gaz_train_docs, "trained_v1.docbin")

In [None]:
# Annotate the test docs with the gazetteer pipeline and cache the result on disk.
gaz_docs_test = list(combined_sentiment.pipe(test_docs))
write_docs_to_docbin(gaz_docs_test, "gaz_docs_test.docbin")

In [None]:
# Load the cached gazetteer annotations.
# NOTE: "trained_v1.docbin" is only written by the commented-out cell above, so
# that cell has to be run once before this file exists.
gaz_docs_train = list(utils.docbin_reader("trained_v1.docbin", spacy_model_name=spacy_model_name))

# Re-reads the file written in the previous cell, replacing the in-memory list.
gaz_docs_test = list(utils.docbin_reader("gaz_docs_test.docbin", spacy_model_name=spacy_model_name))

In [None]:
# Gazetteer pipeline scored on the training split. The predictions are computed
# inline because gaz_docs_train_labels is only assigned in the following cell,
# which would make this cell fail on a fresh top-to-bottom run.
calculate_metrics(train_true, predicted_labels(gaz_docs_train))

In [None]:
# Majority-vote labels produced by the gazetteer pipeline, one per doc.
gaz_docs_train_labels = predicted_labels(gaz_docs_train)
gaz_docs_test_labels = predicted_labels(gaz_docs_test)

In [None]:
# Gazetteer pipeline scored on the test split: test predictions have to be
# compared with the test gold labels (test_true), not the training ones.
calculate_metrics(test_true, gaz_docs_test_labels)

In [None]:
# Export the weakly labelled reviews as JSON-lines files with the columns
# "text" and "label" (one record per review).
#
# Texts come from the annotated docs loaded above (gaz_docs_train /
# gaz_docs_test) and labels from the matching *_labels lists, so text and
# label stay aligned index-by-index within each split.
train_texts = [doc.text for doc in gaz_docs_train]
train_weak_labels = gaz_docs_train_labels

test_texts = [doc.text for doc in gaz_docs_test]
test_weak_labels = gaz_docs_test_labels

# Building the frames from a dict (instead of zip) raises if texts and labels
# ever differ in length, rather than silently truncating to the shorter list.
train_df = pd.DataFrame({"text": train_texts, "label": train_weak_labels})
train_df.to_json("weak_imdb_train.jsonl", orient="records", lines=True)

# The test file is built from the test texts/labels, not the training ones.
test_df = pd.DataFrame({"text": test_texts, "label": test_weak_labels})
test_df.to_json("weak_imdb_test.jsonl", orient="records", lines=True)


In [None]:
# Reload the exported JSON-lines files as datasets.
# Both calls ask for split="train" — presumably because the json loader files a
# lone data file under the "train" split; confirm against the datasets docs.
weak_train_dataset = load_dataset("json", data_files="weak_imdb_train.jsonl", split="train")

weak_test_dataset = load_dataset("json", data_files="weak_imdb_test.jsonl", split="train")


In [None]:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "text" field of ``examples`` with the module-level tokenizer.

    Inputs are padded to the maximum length (padding="max_length") and
    truncated (truncation=True).
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# NOTE(review): this tokenizes `dataset` (the IMDb dataset loaded at the top),
# not weak_train_dataset / weak_test_dataset built from the weak labels —
# confirm which one is meant to be fine-tuned on.
tokenized_datasets = dataset.map(tokenize_function, batched=True)




In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=2 matches the two sentiment classes (0 = negative, 1 = positive).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

In [None]:
from transformers import TrainingArguments

# First definition of training_args; it is reassigned further down (with
# evaluation_strategy="epoch") before the Trainer is built.
training_args = TrainingArguments(output_dir="test_trainer")

In [None]:
import numpy as np
import evaluate

# Accuracy metric used by compute_metrics below.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score an evaluation batch with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class
    is the argmax of the logits over the last axis.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [None]:
from transformers import TrainingArguments, Trainer

# Replaces the earlier training_args; evaluation is requested once per epoch.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
# small_train_dataset / small_eval_dataset were never defined anywhere in the
# notebook. Fixed-size, seeded subsets of the tokenized splits keep the
# fine-tuning run short and repeatable.
# TODO(review): these subsets come from `tokenized_datasets`, i.e. the original
# IMDb labels; swap in the tokenized weak-label datasets if the goal is to
# train on the weak labels.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()