In [87]:
import re
import numpy as np
from transformers import AutoTokenizer
import tensorflow as tf
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [90]:
# Read every line of the sentiment file into memory.
with open('../lecture5/sentiment.txt') as source_file:
    sentiment = source_file.readlines()

# Trim and lower-case each line, then split it on tabs into its fields.
text_processing = [raw_line.strip().lower().split('\t') for raw_line in sentiment]

In [93]:
# Hold out 30% of the rows as the test portion; random_state=0 pins the split.
data_train, data_test = train_test_split(
    text_processing,
    test_size=0.3,
    random_state=0,
)

In [98]:
def _rows_to_columns(rows):
    """Convert ``[text, label, ...]`` rows into a column-oriented dict.

    The text field is stripped of surrounding whitespace and of every
    character that is neither a word character nor whitespace; the label
    field is parsed with ``int``.

    Args:
        rows: iterable of indexable rows; element 0 is the text and
            element 1 is the label.

    Returns:
        dict with parallel ``'text'`` and ``'label'`` lists, one entry per row.

    Raises:
        IndexError: if a row has fewer than two fields.
        ValueError: if a label cannot be parsed by ``int``.
    """
    columns = {'text': [], 'label': []}
    for row in rows:
        columns['text'].append(re.sub(r'[^\w\s]', '', row[0].strip()))
        columns['label'].append(int(row[1]))
    return columns


# Both splits go through the same cleaning so they cannot drift apart.
dataset_train = _rows_to_columns(data_train)
dataset_test = _rows_to_columns(data_test)

In [99]:
# Wrap each column dict in a Dataset and group them under their split names.
split_columns = {'train': dataset_train, 'test': dataset_test}
dataset_dict = DatasetDict(
    {split_name: Dataset.from_dict(columns) for split_name, columns in split_columns.items()}
)

In [100]:
# Display the DatasetDict summary (features and row counts of each split).
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 700
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 300
    })
})

In [101]:
# Load the tokenizer published under this checkpoint name.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [102]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: mapping that holds the texts to encode under ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [103]:
# Apply preprocess_function to the DatasetDict in batched mode.
tokenized_data = dataset_dict.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [104]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; return_tensors="tf" asks for TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

In [105]:
# Settings shared by the training and validation pipelines, kept in one place
# so the two datasets cannot silently diverge.
tf_dataset_options = dict(
    columns=["attention_mask", "input_ids", "label"],
    batch_size=16,
    collate_fn=data_collator,
)

# Training batches are shuffled.
tf_train_dataset = tokenized_data["train"].to_tf_dataset(
    shuffle=True,
    **tf_dataset_options,
)

# Validation must come from the held-out "test" split. Building it from
# "train" (as before) evaluates the model on the data it was fitted on and
# leaves the held-out split unused.
tf_validation_dataset = tokenized_data["test"].to_tf_dataset(
    shuffle=False,
    **tf_dataset_options,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [107]:
from transformers import create_optimizer

# `import tensorflow as tf` was repeated here; it is already imported in the
# notebook's first cell and nothing in this cell uses it.

batch_size = 16
# Must equal the number of epochs the model is actually trained for (3), so
# the learning-rate schedule spans the run exactly. It was previously sized
# for 5 epochs, which meant the decay never completed.
num_epochs = 3
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# No warm-up; the schedule covers `total_train_steps` optimizer steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [108]:
from transformers import TFAutoModelForSequenceClassification

# Sequence-classification model from the "distilbert-base-uncased" checkpoint,
# configured with num_labels=2.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [109]:
import tensorflow as tf

# Only the optimizer is configured here.
# NOTE(review): no `loss` is passed — presumably the model's built-in loss is
# relied upon; confirm.
model.compile(
    optimizer=optimizer,
)

In [111]:
# Train for `num_epochs` — the same variable that sizes the learning-rate
# schedule — rather than a separate hard-coded literal that can drift from it.
model.fit(
    x=tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
)

Epoch 1/3
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4fad309070>