## Story Cloze: Bert for Sequence Classification

#### install tools

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

import datasets
from datasets import load_dataset, load_metric
from datasets import Dataset


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# load training, validation, and test datasets
train_df = pd.read_json('/content/drive/MyDrive/Project/ROCStories_winter2017_mixed.json')
val_df = pd.read_csv('/content/drive/MyDrive/Project/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Project/cloze_test_test__winter2018-cloze_test_ALL_test - 1.csv')

### Modify Training Test and Validation Datasets

In [6]:
def modify_training(data):
  new = pd.DataFrame()
  new['stories'] = data.story
  new['ending0'] = data.ending0
  new['ending1'] = data.ending1
  new['labels'] = data.label
  ntest = new.melt(id_vars=['stories','labels'], value_vars=['ending0', 'ending1'], var_name='ending_number', value_name='endings')
  check_last_index_match = lambda row: 1 if row['ending_number'][-1] == str(row['labels']) else 0
  ntest['label']=ntest.apply(check_last_index_match, axis=1)
  ntest.drop(['labels','ending_number'],inplace=True,axis=1)
  return ntest.sample(frac=1,replace=False).reset_index(drop=True)

def modify_validation(val_df):
  validation = pd.DataFrame()
  validation['stories'] = val_df.iloc[:,1:5].apply(lambda row: ' '.join(row), axis=1)
  validation['ending1'] = val_df.iloc[:,-3]
  validation['ending2'] = val_df.iloc[:,-2]
  validation['labels'] = val_df.iloc[:,-1]
  vtest = validation.melt(id_vars=['stories','labels'], value_vars=['ending1', 'ending2'], var_name='ending_number', value_name='endings')
  check_last_index_match = lambda row: 1 if row['ending_number'][-1] == str(row['labels']) else 0
  vtest['label']=vtest.apply(check_last_index_match, axis=1)
  vtest.drop(['labels','ending_number'],inplace=True,axis=1)
  return vtest.sample(frac=1,replace=False).reset_index(drop=True)



In [7]:
### turn all train, validation, and test datasets into a dataset dictionary ###
test_df['label'] = [-1] * len(test_df)

train_mod = modify_training(train_df)
val_mod = modify_validation(val_df)
test_mod = modify_validation(test_df)
test_mod['label'] = [-1] * len(test_mod)


train_dataset = Dataset.from_dict(train_mod)
val_dataset = Dataset.from_dict(val_mod)
test_dataset = Dataset.from_dict(test_mod)
data_dict = datasets.DatasetDict({"train":train_dataset,
                                        "validation" : val_dataset,
                                        "test":test_dataset})

## Look at some examples

In [8]:
def show_one(example):
    print(f"Story: {example['stories']}")
    print(f"\nEnding: {example['endings']}")
    print(f"\nLabel: {example['label']}")

In [9]:
show_one(data_dict['train'][9])

Story: Philip hid behind a bush. A friend walked by the bush. Philip jumped out of the bush to scare his friend. His friend punched Philip in fear.

Ending: The whole car ride, Paul worried what others drivers were thinking.

Label: 0


In [10]:
show_one(data_dict['validation'][99])

Story: I made a resolution to quit my job. I wrote my letter of resignation. I quietly went to the office and turned it in. I quit my job.

Ending: I go back to work at my job tomorrow.

Label: 0


## Preprocess

In [11]:
### load tokenizer ###
from transformers import AutoTokenizer
checkpoint = '/content/drive/MyDrive/Project/bert_model_weights'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

#### Tokenize each sentence

In [12]:
def preprocess_function(examples):
    return tokenizer(examples['stories'], examples['endings'], truncation=True, padding='longest',return_tensors='tf')

In [13]:
pre_tokenizer_columns = set(data_dict["train"].features)
encoded_dataset = data_dict.map(preprocess_function, batched=True)
tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
print("Columns added by tokenizer:", tokenizer_columns)

Map:   0%|          | 0/105330 [00:00<?, ? examples/s]

Map:   0%|          | 0/3142 [00:00<?, ? examples/s]

Map:   0%|          | 0/3142 [00:00<?, ? examples/s]

Columns added by tokenizer: ['attention_mask', 'input_ids', 'token_type_ids']


#### Look at some tokenization and preprocessing outputs

In [14]:
examples = data_dict["train"][:3]
features = preprocess_function(examples)
print(
    len(features["input_ids"]),
    len(features["input_ids"][0]),
    [len(x) for x in features["input_ids"]],
)

3 59 [59, 59, 59]


In [15]:
[tokenizer.decode(features["input_ids"][x]) for x in range(len(features))]

['[CLS] sam had never been to the beach. he was dying of cancer. he wanted to go before he died. his friends wanted to take him. [SEP] she wanted to do a great job. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] jenny's daughter woke up in the middle of the night. she climbed into bed with jenny and her husband. there wasn't enough room in the bed. the young girl fell out of the bed and hit her head. [SEP] jenny stopped letting her daughter get into her bed. [SEP]",
 "[CLS] jimmy decided he didn't want anymore kids. he went to the doctor to get a consult. jimmy and the doctor decided a vasectomy fit his needs. he went back to get it done. [SEP] the monster at the door was horrifying. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]"]

## Load Model

In [16]:
from transformers import TFBertForSequenceClassification

id2label = {0: "Invalid", 1: "Valid"}
label2id = {val: key for key, val in id2label.items()}

model = TFBertForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model.config.dropout = 0.3
model.config.hidden_dropout_prob = 0.3

In [18]:
tf_train_dataset = model.prepare_tf_dataset(
    encoded_dataset["train"],
    shuffle=True,
    batch_size=16,
    tokenizer=tokenizer
)

tf_validation_dataset = model.prepare_tf_dataset(
    encoded_dataset["validation"],
    shuffle=False,
    batch_size=16,
    tokenizer=tokenizer,
)

#### Evaluation Metrics

In [19]:
import evaluate
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [21]:
from transformers import create_optimizer

num_epochs = 3
batch_size = 16
batches_per_epoch = len(encoded_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=3e-05, num_warmup_steps=0, num_train_steps=total_train_steps
)
model.compile(optimizer=optimizer)

In [22]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=tf_validation_dataset
)

In [23]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[metric_callback],
)

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7f5e4d0b4880>

In [25]:
index = np.random.randint(0, len(test_df)-2)
sentences = [
    test_df.iloc[index:index+1,1:6].apply(lambda row: ' '.join(row), axis=1)[index],
    test_df.iloc[index:index+1,[1,2,3,4,6]].apply(lambda row: ' '.join(row), axis=1)[index]
             ]
print(sentences[0])
print(sentences[1])

Jack needed to get water for his cows. The well at his home was broken. Jack looked for a working well. Jack found a working will atop a hill in the city park. Jack had to ask permission to use the well from its owner.
Jack needed to get water for his cows. The well at his home was broken. Jack looked for a working well. Jack found a working will atop a hill in the city park. Jack got water and gave it to his cows.


In [26]:
tokenized = tokenizer(sentences, return_tensors="tf", padding="longest")

logits = model(tokenized).logits
print(logits)
classifications = np.argmax(logits, axis=1)
print(classifications)

classifications = [model.config.id2label[output] for output in classifications]
print(classifications)

tf.Tensor(
[[ 1.6451956  -1.8512077 ]
 [ 0.4198412  -0.29284844]], shape=(2, 2), dtype=float32)
[0 0]
['Invalid', 'Invalid']


In [29]:
model.save('/content/drive/MyDrive/Project/loaded_model.keras')



In [28]:
# adding drop out layer lowers validation loss and increases accuracy