<a href="https://colab.research.google.com/github/thierrydecae/Fine-Tune-Bert/blob/main/Fine_Tune_Bert_Hugging_Face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SETUP

In [None]:
! pip install transformers
! pip install -U accelerate
! pip install -U transformers
! pip install datasets
! pip install evaluate

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import numpy as np
import evaluate
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from datasets import Dataset
import tensorflow as tf
import torch
%matplotlib inline

In [None]:
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

GET DATA

In [None]:
def twenty_newsgroup_to_df(subset):
    """Load one split of the 20 Newsgroups corpus into a DataFrame.

    Parameters
    ----------
    subset : str
        Split name passed straight through to ``fetch_20newsgroups``
        (the notebook calls this with ``'train'`` and ``'test'``).

    Returns
    -------
    pandas.DataFrame
        One row per post, with columns:

        * ``text``       - post body (headers, footers and quotes removed)
        * ``label``      - integer class id from ``newsgroups.target``
        * ``label_name`` - newsgroup name looked up from ``target_names``
        * ``date``       - timestamp taken when the frame was built
    """
    newsgroups = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))

    # Build the frame column-wise so ``label`` keeps an integer dtype
    # (a list-of-lists followed by .T would make every column ``object``).
    out = pd.DataFrame({
        'text': newsgroups.data,
        'label': newsgroups.target.tolist(),
    })

    # Map ids to names under a distinct column name, so ``label`` is
    # unambiguously the integer id and no merge suffixes are involved.
    out['label_name'] = out['label'].map(dict(enumerate(newsgroups.target_names)))

    out['date'] = pd.to_datetime('now')
    return out

# Load both splits, keeping only the raw text and its label.
train = twenty_newsgroup_to_df('train')[['text', 'label']]
test = twenty_newsgroup_to_df('test')[['text', 'label']]
train.head()

TOKENIZE DATA

In [None]:
# Whitespace-delimited word count of every message in the training split
seq_len = [len(message.split()) for message in train["text"].to_numpy()]

# Distribution of those counts, 30 bins
pd.Series(seq_len).hist(bins=30)

In [None]:
# Intended cap on sequence length (presumably counted in tokens; verify at the tokenizer call).
MAX_LEN = 128

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize a batch of examples for the ``Dataset.map`` call.

    Reads the ``"text"`` field of ``examples`` and returns the tokenizer
    output. Every sequence is truncated and padded to exactly ``MAX_LEN``
    tokens; without an explicit ``max_length`` the padding would instead
    extend to the tokenizer's model maximum, leaving ``MAX_LEN`` unused.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )

In [None]:
# Wrap each pandas split in a datasets.Dataset (train first, then test) ...
train_dataset, test_dataset = (Dataset.from_pandas(frame) for frame in (train, test))

# ... then tokenize both in batched mode.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Shuffle each tokenized split with a fixed seed and keep its first 1000 rows.
small_train_dataset, small_eval_dataset = (
    split.shuffle(seed=42).select(range(1000))
    for split in (tokenized_train, tokenized_test)
)

# Train with PyTorch

In [None]:
# Number of distinct label values in the training split (sizes the classifier head).
num_cats = train["label"].nunique(dropna=False)

In [None]:
# Pretrained BERT with a classification head sized to the number of newsgroups.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_cats)

# Newer transformers releases renamed ``evaluation_strategy`` to ``eval_strategy``.
# The install cell pulls the latest version unpinned, so pick whichever keyword
# the installed TrainingArguments dataclass actually declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=1e-5,
    num_train_epochs=5,
    **{eval_strategy_arg: "epoch"},  # evaluate at the end of every epoch
)

# Accuracy metric used by compute_metrics.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each row is the arg-max over the last axis of the logits; the return value
    is whatever ``metric.compute`` yields for those predictions against
    ``labels``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
# Bundle the model, hyper-parameters, data subsets and metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Launch training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on small_eval_dataset, the 1000-example sample drawn from the test split.
trainer.evaluate(small_eval_dataset)

In [None]:
# Save the model to the ./model/ directory so it can be reloaded from there.
model.save_pretrained('./model/')

In [None]:
# Reload the saved model from ./model/ and place it on the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
new_model = AutoModelForSequenceClassification.from_pretrained('./model/')
new_model = new_model.to(device)

TRAIN WITH KERAS

In [None]:
# Reuse the 1000-example training subset as the input for the Keras run.
dataset = small_train_dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Truncate to MAX_LEN so long newsgroup posts cannot exceed the model's maximum
# input length; padding=True then pads every row to the longest remaining one.
tokenized_data = tokenizer(
    list(dataset["text"]),
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=MAX_LEN,
)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])

In [None]:
# Load and compile our model.
# num_labels must match the dataset's class count (num_cats); otherwise the
# classification head is built with the library default and cannot represent
# every label id present in `labels`.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_cats)
# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)

TRAIN WITH TF

In [None]:
# Start again from the 1000-example training subset for the tf.data run.
dataset = small_train_dataset

In [None]:
def tokenize_dataset(data):
    """Tokenize the ``"text"`` field of one example for ``Dataset.map``.

    Sequences are truncated to ``MAX_LEN`` tokens so that long posts cannot
    exceed the model's maximum input length. No padding is applied here.
    """
    return tokenizer(data["text"], truncation=True, max_length=MAX_LEN)

dataset = dataset.map(tokenize_dataset)

In [None]:
# Have the model build a shuffled, batch-size-16 dataset from the tokenized examples;
# the tokenizer is handed over as well (presumably for per-batch padding - confirm).
tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)

In [None]:
# Re-compile with a fresh Adam optimizer (learning rate 3e-5) and train on the tf.data pipeline.
optimizer = Adam(3e-5)
model.compile(optimizer=optimizer)
model.fit(tf_dataset)