## Story Cloze: BERT for Sequence Classification

#### Install tools

In [None]:
!pip install datasets



In [None]:
!pip install huggingface_hub



In [None]:
!pip install evaluate



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

import datasets
from datasets import load_dataset, load_metric
from datasets import Dataset


In [None]:
# Mount Google Drive at /content/drive; the CSV files read in the next cell
# are addressed by paths under this mount point.
# NOTE: the google.colab module is specific to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the validation, test, and training CSVs from the mounted Drive folder.
# The training file has an 'Unnamed: 0' column (presumably a saved index —
# TODO confirm) that is dropped right after loading.
val_df = pd.read_csv(
    '/content/drive/MyDrive/Project/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv'
)
test_df = pd.read_csv(
    '/content/drive/MyDrive/Project/cloze_test_test__winter2018-cloze_test_ALL_test - 1.csv'
)
train_df = pd.read_csv(
    '/content/drive/MyDrive/Project/Copy of winter_2017_labeled_p1.csv'
).drop(columns='Unnamed: 0')


### Modify Training, Validation, and Test Datasets

In [None]:
def modify_training(data, random_state=None):
  """Expand every training story into one positive and one negative example.

  Args:
    data: DataFrame whose columns at positions 2..5 hold the four context
      sentences and which has `correctE` and `incorrectE` columns.
    random_state: Optional seed forwarded to `DataFrame.sample` so the
      shuffle can be reproduced. The default (None) keeps the original
      unseeded behaviour.

  Returns:
    A shuffled DataFrame with a fresh 0..2n-1 index and the columns
    `paragraphs` (the four sentences joined by single spaces), `endings`,
    and `label` (1 for the correct ending, 0 for the incorrect one).
  """
  paragraphs = data.iloc[:, 2:6].apply(' '.join, axis=1)
  # ignore_index=True gives both stacked columns a clean 0..2n-1 index, so
  # nothing depends on aligning Series that carry duplicated index labels.
  expanded = pd.DataFrame({
      'paragraphs': pd.concat([paragraphs, paragraphs], ignore_index=True),
      'endings': pd.concat([data['correctE'], data['incorrectE']], ignore_index=True),
  })
  # First half of the stacked rows are the correct endings, second half the incorrect ones.
  expanded['label'] = [1] * len(data) + [0] * len(data)
  shuffled = expanded.sample(frac=1, replace=False, random_state=random_state)
  return shuffled.reset_index(drop=True)

def modify_validation(val_df, random_state=None):
  """Reshape a two-ending cloze frame into one row per (story, ending).

  Args:
    val_df: DataFrame whose columns at positions 1..4 hold the four context
      sentences, whose third- and second-to-last columns hold ending 1 and
      ending 2, and whose last column holds the number (1 or 2) of the
      correct ending. Any other value in the last column (for example a
      -1 placeholder) yields label 0 for both endings.
    random_state: Optional seed forwarded to `DataFrame.sample` so the
      shuffle can be reproduced. The default (None) keeps the original
      unseeded behaviour.

  Returns:
    A shuffled DataFrame with a fresh index and the columns `paragraphs`,
    `endings`, and `label` (1 if the ending is the correct one, else 0).
  """
  validation = pd.DataFrame({
      'paragraphs': val_df.iloc[:, 1:5].apply(' '.join, axis=1),
      'ending1': val_df.iloc[:, -3],
      'ending2': val_df.iloc[:, -2],
      'labels': val_df.iloc[:, -1],
  })
  long_form = validation.melt(
      id_vars=['paragraphs', 'labels'],
      value_vars=['ending1', 'ending2'],
      var_name='ending_number',
      value_name='endings',
  )
  # 'ending1' -> 1, 'ending2' -> 2. Compare numerically rather than as
  # strings so an answer column parsed as float (1.0 / 2.0) or as text
  # ('1' / '2') still matches instead of silently producing all zeros.
  ending_index = long_form['ending_number'].str[-1].astype(int)
  answer = pd.to_numeric(long_form['labels'], errors='coerce')
  long_form['label'] = (ending_index == answer).astype(int)
  shuffled = (
      long_form
      .drop(columns=['labels', 'ending_number'])
      .sample(frac=1, replace=False, random_state=random_state)
  )
  return shuffled.reset_index(drop=True)



In [None]:
### turn all train, validation, and test datasets into a dataset dictionary ###
# modify_validation reads the LAST column as the answer label, so a
# placeholder column is appended to test_df to give it the same layout as
# val_df before it is reshaped.
test_df['label'] = [-1] * len(test_df)

train_mod = modify_training(train_df)
val_mod = modify_validation(val_df)
test_mod = modify_validation(test_df)
# Overwrite the derived 0 labels with -1 for every test row.
test_mod['label'] = -1

# The reshaped frames are pandas DataFrames, so build the datasets with
# from_pandas; preserve_index=False keeps the row index out of the features.
train_dataset = Dataset.from_pandas(train_mod, preserve_index=False)
val_dataset = Dataset.from_pandas(val_mod, preserve_index=False)
test_dataset = Dataset.from_pandas(test_mod, preserve_index=False)
data_dict = datasets.DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

## Look at some examples

In [None]:
def show_one(example):
    """Print the context, the candidate ending, and the label of one example.

    `example` is a mapping with the keys 'paragraphs', 'endings' and 'label'.
    """
    context = example['paragraphs']
    print(f"Context: {context}")
    ending = example['endings']
    print(f"  A - {ending}")
    label = example['label']
    print(f"\nLabel: {label}")

In [None]:
# Print one reshaped validation row (index 900) as a quick sanity check.
show_one(data_dict['validation'][900])

Context: Barry loved to lift weights. He only used his special barbell to lift weights. One day he forgot to bring his barbell! He decided to do cardio instead of lifting.
  A - He always enjoyed a nice nap in his hammock.

Label: 0


## Preprocess

In [None]:
### load tokenizer ###
from transformers import AutoTokenizer
# `checkpoint` is reused further down when the classification model is
# loaded, so the tokenizer and the model weights come from the same repo id.
checkpoint = "ydshieh/bert-base-uncased-yelp-polarity"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/520 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

#### Tokenize each sentence

In [None]:
def preprocess_function(examples):
    """Tokenize (paragraphs, endings) as sentence pairs for a batch of rows.

    `examples` is a batch mapping with 'paragraphs' and 'endings' lists, as
    passed by `Dataset.map(..., batched=True)`.

    Only truncation is applied here. Padding and tensor conversion are left
    to batching time: `model.prepare_tf_dataset(..., tokenizer=tokenizer)`
    receives the tokenizer for that purpose. This avoids storing every row
    padded to the longest sequence of its map batch.
    """
    return tokenizer(examples['paragraphs'], examples['endings'], truncation=True)

In [None]:
# Remember the column names before tokenization so the columns added by the
# tokenizer can be identified afterwards.
pre_tokenizer_columns = set(data_dict["train"].features)
encoded_dataset = data_dict.map(preprocess_function, batched=True)
tokenizer_columns = list(
    set(encoded_dataset["train"].features).difference(pre_tokenizer_columns)
)
print("Columns added by tokenizer:", tokenizer_columns)

Map:   0%|          | 0/26332 [00:00<?, ? examples/s]

Map:   0%|          | 0/3142 [00:00<?, ? examples/s]

Map:   0%|          | 0/3142 [00:00<?, ? examples/s]

Columns added by tokenizer: ['attention_mask', 'input_ids', 'token_type_ids']


In [None]:
# Inspect the feature type of the label column in the encoded training split.
encoded_dataset["train"].features["label"]

Value(dtype='int64', id=None)

#### Look at some tokenization and preprocessing outputs

In [None]:
# Tokenize the first three training rows and report, in order: the number of
# sequences, the length of the first sequence, and the length of each sequence.
examples = data_dict["train"][:3]
features = preprocess_function(examples)
print(
    len(features["input_ids"]),
    len(features["input_ids"][0]),
    list(map(len, features["input_ids"])),
)

3 75 [75, 75, 75]


In [None]:
# Decode every tokenized example back to text. Iterate over the input_ids
# themselves: len(features) counts the keys of the encoding (input_ids,
# token_type_ids, attention_mask), not the number of examples.
[tokenizer.decode(ids) for ids in features["input_ids"]]

["[CLS] red's dad was a very successful business owner. he owned the world's most famous coffee shop. red hoped one day he would be left the company. when his father died unexpectedly he assumed he would take control. [SEP] red's father left him the coffee shop, but he decided to sell it and pursue his dream of becoming a rodeo clown instead. [SEP]",
 "[CLS] betty doesn't like the area she lives in. she's been looking at real estate listing online. she found a house she really likes in coos bay, oregon. betty bought the house site - unseen. [SEP] when she arrived, the house was in much worse shape than shown. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",
 '[CLS] i saw this really nice car the other day. it was driving the highway. it had this awesome metallic paint color. i have no idea what kind of car it was. [SEP] i later found out it was a spaceship from outer space. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

## Load Model

In [None]:
from transformers import TFBertForSequenceClassification

# Class index -> readable name, plus the reverse lookup, stored on the model config.
id2label = {0: "Invalid", 1: "Valid"}
label2id = dict(zip(id2label.values(), id2label.keys()))

# Load the sequence-classification model from the same checkpoint as the tokenizer.
model = TFBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

tf_model.h5:   0%|          | 0.00/438M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ydshieh/bert-base-uncased-yelp-polarity.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# Wrap the encoded splits as batched datasets for Keras. Only the training
# split is shuffled; both use batches of 16 and are given the tokenizer.
tf_train_dataset = model.prepare_tf_dataset(
    encoded_dataset["train"],
    batch_size=16,
    shuffle=True,
    tokenizer=tokenizer,
)

tf_validation_dataset = model.prepare_tf_dataset(
    encoded_dataset["validation"],
    batch_size=16,
    shuffle=False,
    tokenizer=tokenizer,
)

#### Evaluation Metrics

In [None]:
import evaluate
# Metric object used by compute_metrics, which calls accuracy.compute(...).
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Return the accuracy of arg-max class predictions against the labels.

    `eval_pred` is unpacked as a (predictions, labels) pair; the predicted
    class is the index of the largest value along axis 1 of the predictions.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
from transformers import create_optimizer

# Schedule length = whole batches per epoch * number of epochs.
num_epochs, batch_size = 3, 16
batches_per_epoch = len(encoded_dataset["train"]) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)

# create_optimizer returns an (optimizer, schedule) pair; only the optimizer
# is handed to compile. No loss argument is passed to compile here
# (presumably relying on the model's built-in loss — TODO confirm).
optimizer, schedule = create_optimizer(
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
    init_lr=5e-05,
)
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Callback that passes the validation dataset and compute_metrics to
# KerasMetricCallback so accuracy is reported during training.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=tf_validation_dataset
)

In [None]:
# Fine-tune for num_epochs. tf_validation_dataset is used twice: as Keras
# validation_data and, through metric_callback, as the callback's eval_dataset.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[metric_callback],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b62776367a0>

In [None]:
# Hand-written sanity check: one four-sentence context with a plausible
# ending and a gibberish ending.
input_start = "Dan's parents were overweight. Dan was overweight as well. The doctors told his parents it was unhealthy. His parents understood and decided to make a change."
endings = ['They got themselves and Dan on a diet.', "apogjo sidjgoiers dngo a;n."]
# NOTE: `sentences` joins context and ending into ONE string each, whereas
# preprocess_function passes paragraphs and endings to the tokenizer as two
# separate arguments (a sentence pair).
sentences = [f"{input_start} {ending}" for ending in endings]

In [None]:
# Encode each (context, ending) as a sentence pair with truncation, which is
# the same input format preprocess_function produces for training. Feeding a
# single concatenated string would give the model inputs (no separator
# between context and ending, one segment only) that it was not fine-tuned on.
contexts = [input_start] * len(endings)
tokenized = tokenizer(
    contexts, endings, truncation=True, padding="longest", return_tensors="tf"
)

logits = model(tokenized).logits
print(logits)
# Index of the larger logit per row is the predicted class id.
predicted_ids = np.argmax(logits, axis=1)
print(predicted_ids)

# Translate class ids to their readable names via the model config.
classifications = [model.config.id2label[int(class_id)] for class_id in predicted_ids]
print(classifications)

tf.Tensor(
[[-2.6659105  3.1032088]
 [-2.8473575  3.2708547]], shape=(2, 2), dtype=float32)
[1 1]
['Valid', 'Valid']
