In [1]:
!pip3 install datasets
!pip3 install transformers
!pip3 install seqeval
!pip3 install transformers -U
!pip3 install transformers[torch]

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/486.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.6 MB/s[0m eta [36m0:

In [3]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

from datasets import load_dataset,concatenate_datasets, load_metric
import pandas as pd
import numpy as np

# 1: Load Data

In [4]:
wnut = load_dataset("wnut_17")

Downloading builder script:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

Downloading and preparing dataset wnut_17/wnut_17 to /root/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3394 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1009 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1287 [00:00<?, ? examples/s]

Dataset wnut_17 downloaded and prepared to /root/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

## 1.1: Ner_tag Explanation
Each ner_tag describes an entity. It can be one of the following: corporation, creative-work, group, location, person, and product.

B: indicates the beginning of an entity.

I: indicates a token is contained inside the same entity (e.g., the “York” token is a part of the “New York” entity).

O: indicates the token doesn’t correspond to any entity.

In [6]:
label_list = wnut["train"].features[f"ner_tags"].feature.names
id2tag = {id: tag for id, tag in enumerate(label_list)}
id2tag

{0: 'O',
 1: 'B-corporation',
 2: 'I-corporation',
 3: 'B-creative-work',
 4: 'I-creative-work',
 5: 'B-group',
 6: 'I-group',
 7: 'B-location',
 8: 'I-location',
 9: 'B-person',
 10: 'I-person',
 11: 'B-product',
 12: 'I-product'}

In [7]:
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

## 1.2: Reorganize train & validation datasets
Our dataset is not that large. Remember, Transformers require lots of data to take advantage of their superior performance.

To solve this issue, we concatenate training and validation datasets into a single training dataset. The test dataset will remain as-is for validation purposes:

In [8]:
# merge train & validation sets
from datasets import concatenate_datasets

train_dataset = concatenate_datasets([wnut["train"],wnut["validation"]])
train_dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 4403
})

## 1.3: Training Example

In [9]:
ith_example=2

print(wnut["train"][ith_example]['tokens'])
print([id2tag[label] for label in train_dataset[ith_example]['ner_tags']])

['Pxleyes', 'Top', '50', 'Photography', 'Contest', 'Pictures', 'of', 'August', '2010', '...', 'http://bit.ly/bgCyZ0', '#photography']
['B-corporation', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


# 2: Data Preprocessing

Transformer models mostly use sub-word-based tokenizers.

During tokenization, some words could be split into two or more words. This is a standard practice because rare words could be decomposed into meaningful tokens. For example, BERT models implement by default the Byte-Pair Encoding (BPE) tokenization.

In [10]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## 2.1 Data Tokenizer Example

Notice that there are two significant issues:

The special tokens [CLS] and [SEP] are added.
The token “Pxleyes” is split into 3 sub-tokens : p, ##xley and ##es.
In other words, the tokenization creates a mismatch between the inputs and the labels. Hence, we realign tokens and labels in the following way:

Each single word token is mapped to its corresponding ner_tag.
We assign the label -100 to the special tokens [CLS] and [SEP] so the loss function ignores them. By default, PyTorch ignores the -100 value during loss calculation.
For subwords, we only label the first token of a given word. Thus, we assign -100 to other subtokens from the same word.
For example, the token Pxleyes is labeled as 1 (B-corporation). It is tokenized as [‘p’, ‘##xley’, ‘##es’] and after token alignment the labels should become [1, -100, -100]

In [11]:
index = 2

tokenized_input = tokenizer(train_dataset[index]["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [12]:
#input
print(train_dataset[index]["tokens"])

['Pxleyes', 'Top', '50', 'Photography', 'Contest', 'Pictures', 'of', 'August', '2010', '...', 'http://bit.ly/bgCyZ0', '#photography']


In [13]:
#tokenized
print(tokens)

['[CLS]', 'p', '##xley', '##es', 'top', '50', 'photography', 'contest', 'pictures', 'of', 'august', '2010', '.', '.', '.', 'http', ':', '/', '/', 'bit', '.', 'l', '##y', '/', 'b', '##gc', '##y', '##z', '##0', '#', 'photography', '[SEP]']


## 2.2: Custom Tokenization Function

In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## 2.3: Tokenize Training Set

In [15]:
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

Map:   0%|          | 0/4403 [00:00<?, ? examples/s]

In [16]:
tokenized_train_dataset[0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  1030,
  2703,
  17122,
  2009,
  1005,
  1055,
  1996,
  3193,
  2013,
  2073,
  1045,
  1005,
  1049,
  2542,
  2005,
  2048,
  3134,
  1012,
  3400,
  2110,
  2311,
  1027,
  9686,
  2497,
  1012,
  3492,
  2919,
  4040,
  2182,
  2197,
  3944,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [17]:
id2tag[-100]='ignore'
exml=tokenized_train_dataset[2]

pd.DataFrame({
              'tokens':tokenizer.convert_ids_to_tokens(exml["input_ids"]),
              'ner_labels':exml['labels'],
              'ner_tags': [id2tag[label] for label in exml['labels']]
            })

Unnamed: 0,tokens,ner_labels,ner_tags
0,[CLS],-100,ignore
1,p,1,B-corporation
2,##xley,-100,ignore
3,##es,-100,ignore
4,top,0,O
5,50,0,O
6,photography,0,O
7,contest,0,O
8,pictures,0,O
9,of,0,O


# 3: Fine-Tuning the Model

## 3.1: Baseline Model

The most obvious choice for a baseline classifier is to tag every token with the most frequent entity throughout the entire training dataset— the O entity:

In [18]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(
              # index and its associated input id, similar to flatMap
              pd.Series(tokenized_train_dataset['input_ids']).explode(),
              # index and its associated label
              pd.Series(tokenized_train_dataset['labels']).explode().astype(str)
)

dummy_clf.score(
                pd.Series(tokenized_train_dataset['input_ids']).explode(),
                pd.Series(tokenized_train_dataset['labels']).explode().astype(str)
)

0.5888494815191806

## 3.2: Improve Baseline Model

The baseline classifier becomes less naive if we tag each token with the most frequent label of the sentence it belongs:

In [19]:
exploded_values=pd.Series(tokenized_train_dataset['labels']).explode()
exploded_values=pd.DataFrame(exploded_values,columns=['B'])

# most frequent label for each example in the training dataset.
most_frequent_elem_by_doc=pd.Series(tokenized_train_dataset['labels'])\
                            .apply(lambda x:  max(set(x), key=x.count))
most_frequent_elem_by_doc=pd.DataFrame(most_frequent_elem_by_doc,columns=list('A'))

# most frequent label for each token in the training dataset, with one row per token
df_most_freq_token=exploded_values.merge(most_frequent_elem_by_doc, how='right',
                                         left_index=True,
                                         right_index=True)

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])
dummy_clf.score(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])

0.7197897448947134

# 4: Training

## 4.1 Data Collator

takes a batch of input examples, pads them to a uniform length, and creates input tensors suitable for token classification models.

In [20]:
#Data Collator

from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

## 4.2: Metrics, Evaluation

using the seqeval library to calculate metrics for named entity recognition (NER) tasks

Note: Remember, the loss function ignores all tokens tagged with -100 during training. Our evaluation function should also take into account this information.

Hence, the compute_metrics function is defined a bit differently — we calculate precision, recall, f1-score, and accuracy by ignoring everything tagged with -100:

In [21]:
metric_seqeval = load_metric("seqeval")
example = tokenized_train_dataset[2]

labels = [label_list[i] for i in example["ner_tags"]]
metric_seqeval.compute(predictions=[labels], references=[labels])

  metric_seqeval = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

{'corporation': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [22]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric_seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## 4.3: Model Training

Finally, we instantiate the Trainer class to fine-tune our model. Notice the usage of the EarlyStopping callback:

In [23]:
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer, AutoModelForTokenClassification, DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir='./log_results',
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    warmup_steps=500,
    eval_steps=60,         # originally 60
    save_steps=60,         # originally 60
    evaluation_strategy="steps",
    load_best_model_at_end=True
)

# total 4403 data
top = 4403
# Create a new dataset object containing only the top number of rows
tokenized_train_dataset_top = tokenized_train_dataset.select(range(top))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 6)]
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
predictions, labels, _ = trainer.predict(tokenized_wnut["test"])
predictions = np.argmax(predictions, axis=2)

index = 1

# Remove ignored index (special tokens)
# Only assign label list's label when the predicted label corresponding label is not -100
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
print(true_predictions[index])
print(true_labels[index])
print(len(true_predictions[index]))
print(len(true_labels[index]))

results = metric_seqeval.compute(predictions=true_predictions, references=true_labels)
results

In [27]:
# train_df = pd.DataFrame(tokenized_train_dataset.to_pandas())

# # Extract the true labels from the tokenized_train_dataset
# true_train_labels = [
#     [label_list[l] for l in label if l != -100]
#     for label in tokenized_train_dataset["ner_tags"]
# ]

# train_df["true_labels"] = true_train_labels
# train_df["pred_labels"] = true_train_labels
# train_df["reward"] = 10.0

# from google.colab import drive

# drive.mount('/content/gdrive/')
# path = "gdrive/MyDrive/summer_IP/"

# train_df.to_csv(path + "train_df_avg_reward_" + str(10.000) + ".csv", index=False)

Mounted at /content/gdrive/


## 4.4: Create New Training Data

In [24]:
import pandas as pd

# Convert the dataset to a Pandas DataFrame
train_df = pd.DataFrame(tokenized_train_dataset.to_pandas())

# Make predictions on the tokenized_train_dataset
train_predictions, _, _ = trainer.predict(tokenized_train_dataset)
train_predictions = np.argmax(train_predictions, axis=2)

# Remove ignored index (special tokens)
# Only assign label list's label when the predicted label corresponding label is not -100
true_train_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(train_predictions, tokenized_train_dataset["ner_tags"])
]
# Extract the true labels from the tokenized_train_dataset
true_train_labels = [
    [label_list[l] for l in label if l != -100]
    for label in tokenized_train_dataset["ner_tags"]
]

# Add the true_labels column to the train_df dataframe
train_df["true_labels"] = true_train_labels
# Add the pred_labels column to the train_df dataframe
train_df["pred_labels"] = true_train_predictions
train_df

Unnamed: 0,id,tokens,ner_tags,input_ids,token_type_ids,attention_mask,labels,true_labels,pred_labels
0,0,"[@paulwalk, It, 's, the, view, from, where, I,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, ...","[101, 1030, 2703, 17122, 2009, 1005, 1055, 199...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, -100, -100, 0, 0, -100, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-l...","[O, O, B-location, I-location, O, O, O, O, O, ..."
1,1,"[From, Green, Newsfeed, :, AHFA, extends, dead...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 2013, 2665, 2739, 7959, 2098, 1024, 6289...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, -100, -100, 0, 5, -100, 0, 0, ...","[O, O, O, O, B-group, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,2,"[Pxleyes, Top, 50, Photography, Contest, Pictu...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1052, 20959, 2229, 2327, 2753, 5855, 504...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, -100, -100, 0, 0, 0, 0, 0, 0, 0, 0, ...","[B-corporation, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O]"
3,3,"[today, is, my, last, day, at, the, office, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 2651, 2003, 2026, 2197, 2154, 2012, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]","[O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O]"
4,4,"[4Dbling, 's, place, til, monday, ,, party, pa...","[9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1018, 18939, 2989, 1005, 1055, 2173, 186...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 9, -100, -100, 0, -100, 0, 0, 0, 0, 0, ...","[B-person, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...,...,...,...,...,...,...,...
4398,1004,"[your, sarcasm, is, goals, :, D, :, D]","[0, 0, 0, 0, 0, 0, 0, 0]","[101, 2115, 20954, 2003, 3289, 1024, 1040, 102...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 0, 0, -100]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]"
4399,1005,"[I, HATE, THIS, I, HATE, THIS, I, HATE, THIS, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1045, 5223, 2023, 1045, 5223, 2023, 1045...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4400,1006,"[For, education, ., Lol, yeah, like, my, dads,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 2005, 2495, 1012, 8840, 2140, 3398, 2066...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, -100, 0, 0, 0, 0, -100, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4401,1007,"[excellent, poem, Dubbz﻿]","[0, 0, 9]","[101, 6581, 5961, 12931, 2497, 2480, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 9, -100, -100, -100]","[O, O, B-person]","[O, O, O]"


## 4.5 Create Reward

In [25]:
def compute_reward(pred_labels, true_labels):
    num_O = sum([1 for i in true_labels if i == 'O'])
    num_entity = len(true_labels) - num_O

    num_O_correct = sum([1 for (p, l) in zip(pred_labels, true_labels) if p == l and l == 'O'])
    num_entity_correct = sum([1 for (p, l) in zip(pred_labels, true_labels) if p == l and l != 'O'])

    O_rate = num_O_correct/num_O if num_O != 0 else 0
    entity_rate = num_entity_correct/num_entity if num_entity != 0 else 0

    if num_O == 0:
        reward = entity_rate * 10
        return reward
    if num_entity == 0:
        reward = O_rate * 10
        return reward

    reward = O_rate * 4 + entity_rate * 6
    return reward

In [26]:
train_df["reward"] = train_df.apply(lambda row: compute_reward(row["pred_labels"], row["true_labels"]), axis=1)
average_reward = round(np.mean(train_df["reward"]), 4)
train_df

Unnamed: 0,id,tokens,ner_tags,input_ids,token_type_ids,attention_mask,labels,true_labels,pred_labels,reward
0,0,"[@paulwalk, It, 's, the, view, from, where, I,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, ...","[101, 1030, 2703, 17122, 2009, 1005, 1055, 199...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, -100, -100, 0, 0, -100, 0, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-l...","[O, O, B-location, I-location, O, O, O, O, O, ...",2.956522
1,1,"[From, Green, Newsfeed, :, AHFA, extends, dead...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 2013, 2665, 2739, 7959, 2098, 1024, 6289...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, -100, -100, 0, 5, -100, 0, 0, ...","[O, O, O, O, B-group, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",4.000000
2,2,"[Pxleyes, Top, 50, Photography, Contest, Pictu...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1052, 20959, 2229, 2327, 2753, 5855, 504...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, -100, -100, 0, 0, 0, 0, 0, 0, 0, 0, ...","[B-corporation, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O]",4.000000
3,3,"[today, is, my, last, day, at, the, office, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 2651, 2003, 2026, 2197, 2154, 2012, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]","[O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O]",10.000000
4,4,"[4Dbling, 's, place, til, monday, ,, party, pa...","[9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1018, 18939, 2989, 1005, 1055, 2173, 186...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 9, -100, -100, 0, -100, 0, 0, 0, 0, 0, ...","[B-person, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O]",4.000000
...,...,...,...,...,...,...,...,...,...,...
4398,1004,"[your, sarcasm, is, goals, :, D, :, D]","[0, 0, 0, 0, 0, 0, 0, 0]","[101, 2115, 20954, 2003, 3289, 1024, 1040, 102...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 0, 0, -100]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]",10.000000
4399,1005,"[I, HATE, THIS, I, HATE, THIS, I, HATE, THIS, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1045, 5223, 2023, 1045, 5223, 2023, 1045...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]",10.000000
4400,1006,"[For, education, ., Lol, yeah, like, my, dads,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 2005, 2495, 1012, 8840, 2140, 3398, 2066...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, -100, 0, 0, 0, 0, -100, 0, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",10.000000
4401,1007,"[excellent, poem, Dubbz﻿]","[0, 0, 9]","[101, 6581, 5961, 12931, 2497, 2480, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 9, -100, -100, -100]","[O, O, B-person]","[O, O, O]",4.000000


# 5: Save file

In [27]:
from google.colab import drive

drive.mount('/content/gdrive/')
path = "gdrive/MyDrive/summer_IP/"

train_df.to_csv(path + "train_df_avg_reward_" + str(average_reward) + ".csv", index=False)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [28]:
trainer.evaluate()

{'eval_loss': 0.23672610521316528,
 'eval_precision': 0.5230769230769231,
 'eval_recall': 0.40963855421686746,
 'eval_f1': 0.45945945945945943,
 'eval_accuracy': 0.9459621221837459,
 'eval_runtime': 7.1894,
 'eval_samples_per_second': 179.014,
 'eval_steps_per_second': 2.921,
 'epoch': 3.26}

In [29]:
def tag_sentence(text:str):
    # convert our text to a  tokenized sequence
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to("cuda")
    # get outputs
    outputs = model(**inputs)
    # convert to probabilities with softmax
    probs = outputs[0][0].softmax(1)
    # get the tags with the highest probability
    word_tags = [(tokenizer.decode(inputs['input_ids'][0][i].item()), id2tag[tagid.item()])
                  for i, tagid in enumerate (probs.argmax(axis=1))]

    return pd.DataFrame(word_tags, columns=['word', 'tag'])

In [30]:
text = """Celebrities and tourists from United States are
flooding into Greece. But a harsh winter isn't far off"""

print(tag_sentence(text))

           word         tag
0         [CLS]           O
1   celebrities           O
2           and           O
3      tourists           O
4          from           O
5        united  B-location
6        states  B-location
7           are           O
8      flooding           O
9          into           O
10       greece  B-location
11            .           O
12          but           O
13            a           O
14        harsh           O
15       winter           O
16          isn           O
17            '           O
18            t           O
19          far           O
20          off           O
21        [SEP]           O


In [31]:
text="""Apple unveils all-new MacBook Air,
supercharged by the new M2 chip"""

print(tag_sentence(text))

         word            tag
0       [CLS]              O
1       apple  B-corporation
2          un              O
3        ##ve              O
4       ##ils              O
5         all              O
6           -              O
7         new              O
8         mac      B-product
9      ##book      I-product
10        air      I-product
11          ,              O
12      super              O
13  ##charged              O
14         by              O
15        the              O
16        new              O
17         m2      B-product
18       chip              O
19      [SEP]      B-product


In [32]:
!rm -r log_results/

In [33]:
tokenized_train_dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4403
})

In [2]:
from datasets import load_dataset,concatenate_datasets, load_metric


# Step 2: Create a Pairwise dataset
dataset = load_dataset("CarperAI/openai_summarize_comparisons", split="train")
dataset = dataset.select(range(100))


Found cached dataset parquet (/Users/evan/.cache/huggingface/datasets/CarperAI___parquet/CarperAI--openai_summarize_comparisons-79d2c222a15dc8fb/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


In [3]:
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 100
})

In [4]:
dataset[1]


{'prompt': 'SUBREDDIT: r/relationships\nTITLE: To admit or not to admit snooping...\nPOST: I [25M] have snooped in the past and copped up to it to my gf [25F] of 6 years.  We talked it through.  It had been a year or two since the last time.  That\'s an issue I\'m working on.\n\nNow she has a new close male work friend.  I won\'t go into details, but she hides things from me with him and does other things to make me a bit suspicious.  So...I snooped again, and this time, all texts from her new friend have been deleted and I saw a google search for "how to get over a guy" near some searches of his name and views of his Facebook profile.\n\nI asked her about this guy, not mentioning the snooping, and she denied any feelings, we talked for a long time about our relationship and she insisted that she only loves me and I mean the world to her, and that she really wants to work towards getting this relationship back out of the rut we\'ve been in (we both work all the time and barely see each

In [23]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

# Save the DataFrame to a CSV file
df.to_csv("imdb.csv", index=False)

Found cached dataset imdb (/Users/evan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [17]:
from transformers import AutoTokenizer

# Download the GPT-2 tokenizer files and save them locally
tokenizer_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.save_pretrained(f"{tokenizer_name}_tokenizer_local")

('gpt2_tokenizer_local/tokenizer_config.json',
 'gpt2_tokenizer_local/special_tokens_map.json',
 'gpt2_tokenizer_local/vocab.json',
 'gpt2_tokenizer_local/merges.txt',
 'gpt2_tokenizer_local/added_tokens.json',
 'gpt2_tokenizer_local/tokenizer.json')

In [18]:
from transformers import AutoTokenizer

# Load the GPT-2 tokenizer from a local directory
tokenizer_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(f"{tokenizer_name}_tokenizer_local")

In [19]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2_tokenizer_local', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [20]:
from transformers import AutoTokenizer, AutoModel

# Download and save the GPT-2 model locally
model_name = "gpt2"
model = AutoModel.from_pretrained(model_name)
model.save_pretrained(f"{model_name}_local")

In [21]:
# Load the GPT-2 tokenizer and local model
tokenizer = AutoTokenizer.from_pretrained(model_name)
local_model = AutoModel.from_pretrained(f"{model_name}_local")

In [22]:
local_model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [24]:
from transformers import AutoModelForCausalLM

model_name = "gpt2"

# Save model 
model = AutoModelForCausalLM.from_pretrained(model_name)  
model.save_pretrained(f"{model_name}_CLM_local")

# Reload as CausalLM
model = AutoModelForCausalLM.from_pretrained(f"{model_name}_CLM_local")

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [25]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)