In [1]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from nlp import Dataset
from transformers import AutoConfig
from tqdm.notebook import tqdm
import torchtext
from torchtext import datasets

In [2]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
# One-time data fetch, left disabled: uncomment to download the Yelp Review Polarity archive.
# The CSV-loading cell further down reads train.csv / test.csv from
# .data/yelp_review_polarity_csv/ (presumably the extracted archive — verify on a fresh checkout).
#torchtext.utils.download_from_url(datasets.text_classification.URLS['YelpReviewPolarity'])

In [4]:
# Checkpoint identifier shared by the tokenizer and the classification model.
bert_model_name = 'bert-base-uncased'

In [5]:
# Fast BERT tokenizer for the chosen checkpoint. model_max_length=256 is
# presumably the length that truncation / 'max_length' padding resolve to
# when `tokenize_process` calls the tokenizer without an explicit max_length.
tokenizer = BertTokenizerFast.from_pretrained(
    bert_model_name,
    model_max_length=256,
)

In [6]:
def tokenize_process(batch):
    """Tokenize a batch of texts and shift its labels down by one.

    Args:
        batch: Mapping with a ``'text'`` entry (handed to the module-level
            ``tokenizer``) and a ``'labels'`` entry (iterable of numbers).

    Returns:
        The tokenizer output with an added ``'labels'`` entry that holds
        every original label minus 1 (presumably mapping the CSV's 1/2
        polarity codes to 0/1 — confirm against the data).
    """
    encoded = tokenizer(batch['text'], truncation=True, padding='max_length')
    shifted_labels = []
    for label in batch['labels']:
        shifted_labels.append(label - 1)
    encoded['labels'] = shifted_labels
    return encoded

In [7]:
# The CSV files are read without a header row; the two columns are named
# `labels` and `text` here.
train_df = pd.read_csv(
    '.data/yelp_review_polarity_csv/train.csv',
    names=['labels', 'text'],
)
test_df = pd.read_csv(
    '.data/yelp_review_polarity_csv/test.csv',
    names=['labels', 'text'],
)

In [8]:
# Wrap each pandas frame in an `nlp.Dataset` so it can be mapped and fed to the Trainer.
train_dataset = Dataset.from_pandas(
    train_df,
)
test_dataset = Dataset.from_pandas(
    test_df,
)

In [9]:
# Tokenize both splits in batches; `tokenize_process` also rewrites the `labels` entry.
# NOTE(review): the results are bound back to the same names, so re-running this cell
# presumably applies the `i-1` label shift a second time. Re-run the
# `Dataset.from_pandas` cell first if this cell has to be executed again.
train_dataset = train_dataset.map(tokenize_process, batched=True)
test_dataset = test_dataset.map(tokenize_process, batched=True)

HBox(children=(FloatProgress(value=0.0, max=560.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))




In [10]:
# Restrict both datasets to the four listed columns in 'torch' format
# (the raw `text` column is not listed here).
train_dataset.set_format(
    'torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)
test_dataset.set_format(
    'torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)

In [11]:
# Load the pretrained checkpoint into a sequence-classification model.
# The load warning printed by this cell reports that some weights of the
# classification model are not initialized from the checkpoint; they are
# trained in the fine-tuning cell below.
model = BertForSequenceClassification.from_pretrained(bert_model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
def compute_metrics(pred):
    """Compute binary classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (an array whose arg-max over the last axis is
            taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``, in that order.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by sklearn (support) is not needed.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )[:3]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics.update({'f1': f1, 'precision': precision, 'recall': recall})
    return metrics

training_args = TrainingArguments(
    # --- what to run ---
    do_train=True,
    do_eval=False,
    # NOTE(review): `evaluate_during_training` looks like an older-API flag name;
    # confirm it is accepted by the installed transformers version.
    evaluate_during_training=False,
    # --- schedule / optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.0,
    # --- batch sizes ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # --- output, checkpoints and logs ---
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    save_steps=3000,
    eval_steps=5000,
)

# Bundle the model, the run configuration, both splits and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# Run fine-tuning with the Trainer configured above; the return value is kept in `result`.
# The recorded progress output shows 5 epochs of 35000 iterations each.
result = trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=35000.0, style=ProgressStyle(description_…

  return function(data_struct)





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=35000.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=35000.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=35000.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=35000.0, style=ProgressStyle(description_…





In [14]:
# Persist the fine-tuned model under the local `saved_model` path.
# NOTE(review): only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained('saved_model')

In [15]:
# Evaluate on the test split; as the last expression of the cell, the returned
# metrics dict (loss plus the `compute_metrics` values) is displayed below.
trainer.evaluate(test_dataset)

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=594.0, style=ProgressStyle(description_w…




{'eval_loss': 0.18541416830705834,
 'eval_accuracy': 0.9644473684210526,
 'eval_f1': 0.9643356827961246,
 'eval_precision': 0.9673746093956888,
 'eval_recall': 0.9613157894736842,
 'epoch': 5.0}

In [16]:
# Same evaluation on the training split, for comparison with the test-split metrics above.
trainer.evaluate(train_dataset)

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=8750.0, style=ProgressStyle(description_…




{'eval_loss': 0.06514220461575314,
 'eval_accuracy': 0.9872196428571428,
 'eval_f1': 0.9872143231022907,
 'eval_precision': 0.9876254168647481,
 'eval_recall': 0.9868035714285714,
 'epoch': 5.0}