In [1]:
import pandas as pd
import tensorflow as tf
from nltk import wordpunct_tokenize
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import ClassLabel, Sequence, Dataset

In [2]:
lab2id = {',':1,'.':2,'!':3,'NaN':0}
id2lab = {v:k for k,v in lab2id.items()}

In [3]:
df = pd.read_csv('review-Copy1.csv')
df = df.dropna(axis=0, subset=['reviewText'])
df = df[["reviewText"]]
dataset = Dataset.from_pandas(df)
# dataset = dataset.train_test_split(test_size=0.2)

In [4]:
def process(text):
    lab, tokens = [], []
    tok = wordpunct_tokenize(text['reviewText'])
    while tok[0] in lab2id.keys():
        del tok[0]
    for i in range(len(tok)):
        if tok[i] in lab2id.keys():
            lab[-1] = lab2id[tok[i]]
        else:
            lab.append(0)
            tokens.append(tok[i])
    return {'tokens':tokens, 'labels':lab}

In [5]:
dataset = dataset.map(process)

  0%|          | 0/55014 [00:00<?, ?ex/s]

In [6]:
dataset.features['labels']=Sequence(feature=ClassLabel(num_classes=4, names=['Nan', ',', '.', '!']))

In [7]:
dataset = dataset.train_test_split(test_size=0.2)

In [13]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2lab, label2id=lab2id, from_pt=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["nLabels"] = new_labels
    return tokenized_inputs

In [17]:
tok_df = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [18]:
tf_train_df = tok_df["train"].to_tf_dataset(
    columns=['attention_mask', 'input_ids', 'nLabels', 'token_type_ids'],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
tf_test_df = tok_df["test"].to_tf_dataset(
    columns=['attention_mask', 'input_ids', 'nLabels', 'token_type_ids'],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`nLabels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 3
num_train_steps = len(tf_train_df) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [14]:
model.fit(
    tf_train_df,
    callbacks=[callback],
    epochs=num_epochs,
)

41

In [43]:
tok = tok_df.with_format("tf")