<a href="https://colab.research.google.com/github/felixbmuller/nlp-commonsense/blob/main/NLP_Commonsense_Assignment_2_KB_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Commonsense Assignment 2 - Knowledge Base Model

## Setup

In [1]:
!pip install -q transformers datasets torch torchvision
!apt install git-lfs >/dev/null

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h



In [2]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset, load_metric
import pandas as pd
import transformers

print(transformers.__version__)

model_checkpoint = "bert-base-uncased"
batch_size = 16

datasets = load_dataset("super_glue", "copa")

4.26.1


Downloading builder script:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/38.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.8k [00:00<?, ?B/s]

Downloading and preparing dataset super_glue/copa to /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/44.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Setup and Test Knowledge Base

In [4]:

import utils
import process_examples
import find_shortest_path
import renderer as R
import qa_preprocessing as QA

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:
conceptnet = utils.load_conceptnet(load_compressed=True)

In [6]:
example = datasets["train"][0]

example

{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'idx': 0,
 'label': 0}

In [7]:
print(process_examples.extract_terms(example["premise"]))
print(process_examples.extract_terms(example["choice1"]))
print(process_examples.extract_terms(example["choice2"]))
print(find_shortest_path.find_word_path('body', 'sun', conceptnet))
print(find_shortest_path.find_word_path('body', 'sun', conceptnet, renderer=None))

{'my body', 'cast', 'grass', 'body', 'shadow'}
{'sun', 'wa', 'rising'}
{'wa', 'cut', 'grass'}
body <--RelatedTo-- sun
[182090, 1539020]


In [8]:
R.render_path_natural([], conceptnet)

('', [])

In [9]:
R.render_path_natural([182090, 1539020], conceptnet)

('sun is like body.', [0.909])

In [10]:
print(QA.get_knowledge_for_example(example["premise"], example["choice1"], conceptnet, max_paths=100))
print(QA.get_knowledge_for_example(example["premise"], example["choice1"], conceptnet, max_paths=3))

iron can be cast . sun has iron. cast is like rise. rising and rise have similar meanings. grass is like plant. sun is like plant. grass is like side. side is like wa. grass is in the context of slang. rising is in the context of slang. sun is like body. wash is like body. wash and wa have similar meanings. dyke is like body. dyke is like rising. shadow is like sun.
sun is like body. shadow is like sun. cast is like rise. rising and rise have similar meanings.


## Preprocessing the data

In [11]:
from tqdm.notebook import tqdm

In [12]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
ending_names = ["choice1", "choice2"]

QUESTION_MAP = {
    "cause": "What was the cause of this?",
    "effect": "What happened as a RESULT?",
}

MAX_PATHS = 3 # only take the three most relevant knowledge paths into account 

def preprocess_function(examples):
    # Repeat premise and question twice for both possible answers
    # for each repetitions, add knowledge from the knowledge base in front of 
    # the premise. The knowledge added is about connections between the premise 
    # and the answer choice. The type of question (cause/effect) is also taken
    # into account
    first_sentences = [
                       [f"{QA.get_knowledge_for_example(f'{context} {question}', c1, conceptnet, MAX_PATHS)} {context} {QUESTION_MAP[question]}", 
                        f"{QA.get_knowledge_for_example(f'{context} {question}', c2, conceptnet, MAX_PATHS)} {context} {QUESTION_MAP[question]}"] 
                       for context, question, c1, c2 in zip(
                           tqdm(examples["premise"]), 
                           examples["question"], 
                           examples["choice1"], 
                           examples["choice2"]
                           )
                       ]
    # Grab all second sentences possible for each context.
    second_sentences = [[c1, c2] 
                        for c1, c2 in zip(examples["choice1"], examples["choice2"])]
    
    # Flatten everything
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])

    if not len(first_sentences) == len(second_sentences):
        raise ValueError("lengths dont match")
    
    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten
    return {k: [v[i:i+2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists of lists for each key: a list of all examples (here 5), then a list of all choices (4) and a list of input IDs (length varying here since we did not apply any padding):

### Test Tokenizer and Preprocessing

In [14]:
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
examples = datasets["train"][:2]
features = preprocess_function(examples)

print(features.keys())
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

  0%|          | 0/2 [00:00<?, ?it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
2 2 [46, 38]


To check we didn't do anything group when grouping all possibilites then unflattening, let's have a look at the decoded inputs for a given example:

In [16]:
len(datasets["train"]), len(datasets["test"]), len(datasets["validation"])

(400, 500, 100)

In [18]:
idx = 1
[tokenizer.decode(features["input_ids"][idx][i]) for i in range(2)]

["[CLS] difficult is like hard. hard time is like difficult. steel is like cause. steel is like hard. the woman tolerated her friend's difficult behavior. what was the cause of this? [SEP] the woman knew her friend was going through a hard time. [SEP]",
 "[CLS] cause is like action. kindness is action. easiness is behavior. easiness is like kindness. friend is like good. good is advantage. the woman tolerated her friend's difficult behavior. what was the cause of this? [SEP] the woman felt that her friend took advantage of her kindness. [SEP]"]

We can compare it to the ground truth:

In [19]:
datasets["train"][3]

{'premise': 'The runner wore shorts.',
 'choice1': 'The forecast predicted high temperatures.',
 'choice2': 'She planned to run along the beach.',
 'question': 'cause',
 'idx': 3,
 'label': 0}

### Apply Preprocessing to the Whole Dataset

Applying the preprocessing including querying the knowledge base takes around 15 seconds per example. To avoid lengthy calulcations at every execution, this sections allows to save/retrieve results using Google drive. We do not apply preprocessing to the test set, as it is not needed anyways.

In [20]:
import joblib
import pyarrow as pa
from datasets import Dataset, DatasetDict, concatenate_datasets

use_gdrive = False

In [23]:
encoded_val = preprocess_function(datasets["validation"])
joblib.dump(encoded_val, "./copa_val.joblib")

  0%|          | 0/100 [00:00<?, ?it/s]

['./copa_val.joblib']

In [24]:
encoded_train = preprocess_function(datasets["train"])
joblib.dump(encoded_train, "./copa_train.joblib")

  0%|          | 0/400 [00:00<?, ?it/s]

['./copa_train.joblib']

In [25]:
encoded_val = joblib.load("./copa_val.joblib")
encoded_train = joblib.load("./copa_train.joblib")

In [26]:
train_ds = Dataset(pa.Table.from_pydict(encoded_train))
val_ds = Dataset(pa.Table.from_pydict(encoded_val))

In [27]:
# merge tokenizer output with labels from the original dataset
train_ds = concatenate_datasets([train_ds, datasets["train"]], split="train", axis=1)
val_ds = concatenate_datasets([val_ds, datasets["validation"]], split="validation", axis=1)


In [28]:
encoded_datasets = DatasetDict(
    train=train_ds,
    validation=val_ds)

**Add Sorting**

The following code can be used to sort the datasets according to the average number of tokens (average is needed because each datapoint contains two sequences, one for choice 1 and one for choice 2). As this gave worse results, I did not use this in the final solution.

In [29]:
def avg_input_lens(batch):
    vals = [(len(v[0]) + len(v[1]))/2 for v in batch["input_ids"]]
    return {"avg_input_len": vals}

# Uncomment to apply sorting
#encoded_datasets = encoded_datasets.map(avg_input_lens, batched=True)
#encoded_datasets = encoded_datasets.sort("avg_input_len")

In [30]:
s0 = pd.Series(len(encoded_datasets["train"]["input_ids"][i][0]) for i in range(400))
s1 = pd.Series(len(encoded_datasets["train"]["input_ids"][i][1]) for i in range(400))

len_df = pd.DataFrame({"input_ids0": s0, "input_ids1": s1})

In [31]:
len_df

Unnamed: 0,input_ids0,input_ids1
0,46,38
1,53,63
2,60,66
3,52,42
4,42,41
...,...,...
395,54,62
396,62,68
397,49,49
398,68,51


In [32]:
encoded_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 100
    })
})

## Fine-tuning the model

In [34]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-copa-kb",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch


def compute_metrics(eval_predictions):
    predictions, label_ids = eval_predictions
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pytorch_m

When called on a list of examples, it will flatten all the inputs/attentions masks etc. in big lists that it will pass to the `tokenizer.pad` method. This will return a dictionary with big tensors (of shape `(batch_size * 4) x seq_length`) that we then unflatten.

### Test Collator

We can check this data collator works on a list of features, we just have to make sure to remove all features that are not inputs accepted by our model (something the `Trainer` will do automatically for us after):

In [35]:
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Again, all those flatten/un-flatten are sources of potential errors so let's make another sanity check on our inputs:

In [36]:
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(2)]

['[CLS] malpractice is like patient. malpractice is like physician. patient is case. case and lawsuit have similar meanings. the physician misdiagnosed the patient. what happened as a result? [SEP] the patient filed a malpractice lawsuit against the physician. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] patient is people. people has information. examining room is like patient. examining room is like physician. malpractice is like physician. malpractice is like patient. the physician misdiagnosed the patient. what happened as a result? [SEP] the patient disclosed confidential information to the physician. [SEP] [PAD] [PAD] [PAD] [PAD]']

In [37]:
encoded_datasets["train"][8]

{'input_ids': [[101,
   15451,
   18098,
   28804,
   2063,
   2003,
   2066,
   5776,
   1012,
   15451,
   18098,
   28804,
   2063,
   2003,
   2066,
   7522,
   1012,
   5776,
   2003,
   2553,
   1012,
   2553,
   1998,
   9870,
   2031,
   2714,
   15383,
   1012,
   1996,
   7522,
   28616,
   9032,
   26745,
   6924,
   1996,
   5776,
   1012,
   2054,
   3047,
   2004,
   1037,
   2765,
   1029,
   102,
   1996,
   5776,
   6406,
   1037,
   15451,
   18098,
   28804,
   2063,
   9870,
   2114,
   1996,
   7522,
   1012,
   102],
  [101,
   5776,
   2003,
   2111,
   1012,
   2111,
   2038,
   2592,
   1012,
   12843,
   2282,
   2003,
   2066,
   5776,
   1012,
   12843,
   2282,
   2003,
   2066,
   7522,
   1012,
   15451,
   18098,
   28804,
   2063,
   2003,
   2066,
   7522,
   1012,
   15451,
   18098,
   28804,
   2063,
   2003,
   2066,
   5776,
   1012,
   1996,
   7522,
   28616,
   9032,
   26745,
   6924,
   1996,
   5776,
   1012,
   2054,
   3047,
   2004,
   10

### Run Training

In [None]:
trainer.train()

#model.push_to_hub("felixbmuller/bert-base-uncased-finetuned-copa")

The following columns in the training set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, premise, question, idx, choice2.
***** Running training *****
  Num examples = 400
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 75


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.683946,0.54
2,No log,0.657442,0.61
3,No log,0.631907,0.61


The following columns in the evaluation set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, premise, question, idx, choice2.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, premise, question, idx, choice2.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, premise, question, idx, choice2.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=75, training_loss=0.619866943359375, metrics={'train_runtime': 85.674, 'train_samples_per_second': 14.007, 'train_steps_per_second': 0.875, 'total_flos': 89802285776832.0, 'train_loss': 0.619866943359375, 'epoch': 3.0})

## Evalute the Model


In [38]:
predictions, label_ids, metrics = trainer.predict(encoded_datasets["validation"], metric_key_prefix="val")

The following columns in the test set don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: idx, question, choice2, choice1, premise. If idx, question, choice2, choice1, premise are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


In [39]:
metrics

{'val_loss': 0.6887620687484741,
 'val_accuracy': 0.5699999928474426,
 'val_runtime': 139.3069,
 'val_samples_per_second': 0.718,
 'val_steps_per_second': 0.05}

In [40]:
val = pd.DataFrame(datasets["validation"])
val["label_ids"] = label_ids
val["pred0"] = predictions[:, 0]
val["pred1"] = predictions[:, 1]
val["pred_label"] = np.argmax(predictions, axis=1)

Sanity check to ensure that predictions work the way I expect them to do

In [42]:
joblib.dump(val, "./bert-base-uncased-finetuned-copa-kb-validation-results.joblib")

['./bert-base-uncased-finetuned-copa-kb-validation-results.joblib']

In [43]:
import joblib
val = joblib.load("./bert-base-uncased-finetuned-copa-kb-validation-results.joblib")

In [45]:
val.head(20)

Unnamed: 0,premise,choice1,choice2,question,idx,label,label_ids,pred0,pred1,pred_label
0,The man turned on the faucet.,The toilet filled with water.,Water flowed from the spout.,effect,0,1,1,0.153684,0.149661,0
1,The girl found a bug in her cereal.,She poured milk in the bowl.,She lost her appetite.,effect,1,1,1,0.134195,0.148613,1
2,The woman retired.,She received her pension.,She paid off her mortgage.,effect,2,0,0,0.143218,0.068474,0
3,I wanted to conserve energy.,I swept the floor in the unoccupied room.,I shut off the light in the unoccupied room.,effect,3,1,1,0.137576,-0.122737,0
4,The hamburger meat browned.,The cook froze it.,The cook grilled it.,cause,4,1,1,0.122231,0.142994,1
5,I doubted the salesman's pitch.,I turned his offer down.,He persuaded me to buy the product.,effect,5,0,0,0.121666,0.1308,1
6,I decided to stay home for the night.,The forecast called for storms.,My friends urged me to go out.,cause,6,0,0,0.148781,0.049926,0
7,My eyes became red and puffy.,I was sobbing.,I was laughing.,cause,7,0,0,0.114868,0.134569,1
8,The flame on the candle went out.,I blew on the wick.,I put a match to the wick.,cause,8,0,0,0.138676,0.125721,0
9,The man drank heavily at the party.,He had a headache the next day.,He had a runny nose the next day.,effect,9,0,0,0.008865,0.082216,1


In [46]:
wrong_samples = val[val.label !=  val.pred_label]
wrong_samples.sample(25, random_state=42)

Unnamed: 0,premise,choice1,choice2,question,idx,label,label_ids,pred0,pred1,pred_label
93,The woman became famous.,Photographers followed her.,Her family avoided her.,effect,93,0,0,0.113635,0.116983,1
62,The lock opened.,I turned the key in the lock.,I made a duplicate of the key.,cause,62,0,0,0.0214,0.143647,1
63,I put rubber gloves on.,I was preparing to wash my hands.,I was preparing to clean the bathroom.,cause,63,1,1,0.134537,0.133526,0
90,I lit the candle.,Wax dripped off the candle.,The wax on the candle hardened.,effect,90,0,0,0.132564,0.133346,1
85,The woman read the newspaper.,She discovered the outcome of the election.,She casted a vote in the election.,effect,85,0,0,0.136897,0.146086,1
96,I tidied up my house.,I was swamped with work.,I was expecting company.,cause,96,1,1,0.15809,0.069923,0
9,The man drank heavily at the party.,He had a headache the next day.,He had a runny nose the next day.,effect,9,0,0,0.008865,0.082216,1
30,The secretary put the caller on hold.,The caller's phone lost reception.,The caller waited on the line.,effect,30,1,1,0.143084,0.139558,0
18,The cook's eyes watered.,He ran out of onions.,He cut an onion.,cause,18,1,1,0.137863,0.132948,0
7,My eyes became red and puffy.,I was sobbing.,I was laughing.,cause,7,0,0,0.114868,0.134569,1


# Calculate t-test

In [48]:
baseline = {
    "1": [73, 25, 26, 3, 42, 30, 9, 89],
    "2": [91, 70, 65, 52, 98],
    "3": [35, 8],
    "4": [82, 14, 86],
    "5": [38, 49, 97, 10, 36, 4, 55],
}

kb_model = {
    "1": [94, 27, 19, 30, 71, 25, 3, 33],
    "2": [52, 28, 98, 62, 83, 0],
    "3": [54, 8, 35, 59],
    "4": [14, 82, 17],
    "5": [38, 55, 10, 63],
}

In [49]:
baseline_vec = {k: [(1 if i in v else 0) for i in range(100)] for k, v in baseline.items()}
kb_model_vec = {k: [(1 if i in v else 0) for i in range(100)] for k, v in kb_model.items()}

In [50]:
print({k: sum(v)*4 for k, v in baseline_vec.items()})
print({k: sum(v)*4 for k, v in kb_model_vec.items()})

{'1': 32, '2': 20, '3': 8, '4': 12, '5': 28}
{'1': 32, '2': 24, '3': 16, '4': 12, '5': 16}


In [51]:
from scipy.stats import ttest_rel, ttest_ind

In [53]:
for k in baseline.keys():
    print(f"{k}: stat, probability: {ttest_rel(baseline_vec[k], kb_model_vec[k])}")

1: stat, probability: TtestResult(statistic=0.0, pvalue=1.0, df=99)
2: stat, probability: TtestResult(statistic=-0.3763388118272598, pvalue=0.7074703580131823, df=99)
3: stat, probability: TtestResult(statistic=-1.4214106244380287, pvalue=0.15833990565972564, df=99)
4: stat, probability: TtestResult(statistic=0.0, pvalue=1.0, df=99)
5: stat, probability: TtestResult(statistic=1.3470946333202294, pvalue=0.18102514023295704, df=99)
