In [1]:
# Download the Transformers library for loading the BERT Model
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 15.8 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 69.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [2]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import the libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch
from tqdm import tqdm
import os
os.environ["WANDB_DISABLED"] = "true"

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [4]:
# Tokenize the Dataset using BERT Tokenzer 
def get_batch_tokenizer(tokenizer, dataset):
    return tokenizer.batch_encode_plus(dataset,
                                       max_length=256,
                                       padding=True,
                                       truncation=True,
                                       add_special_tokens=True,
                                       return_attention_mask=True,
                                       return_tensors='pt')

In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val
                in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
# Helper Function to claculate F1 score
def compute_metrics(p):
    prediction, labels = p
    preds_flat = np.argmax(prediction, axis=1).flatten()
    labels_flat = labels.flatten()
    f1 = f1_score(labels_flat, preds_flat, average='macro')
    return {"f1": f1}

In [16]:
# Load the Dataset  
df = pd.read_csv("/content/ecommerceDataset.csv", names=["labels", "descriptions"])
descriptions = df["descriptions"].map(str).values.tolist()
labels = df["labels"].values.tolist()

# Encode the Labels 
le = LabelEncoder()
labels = le.fit_transform(labels).tolist()

In [17]:
# Split the Dataset into Train, Valid, Test 
x_train, x_test, y_train, y_test = train_test_split(descriptions, labels, test_size=0.4, stratify=labels, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [18]:
# Load the BertTokenizer 
tokenizer = BertTokenizer.from_pretrained(
        "bert-base-uncased",
        do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
# Tokenize the Train, Valid, Test data with helper function
x_train_tokens = get_batch_tokenizer(tokenizer, x_train)
x_valid_tokens = get_batch_tokenizer(tokenizer, x_valid)
x_test_tokens = get_batch_tokenizer(tokenizer, x_test)

In [20]:
# Define the Dataset
train_dataset = Dataset(x_train_tokens, y_train)
valid_dataset = Dataset(x_valid_tokens, y_valid)
test_dataset = Dataset(x_test_tokens, y_test)

In [21]:
# Load the BertForSequenceClassification Model 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    from_tf=True,
    num_labels=4)

# Access cuda for Loading Model in GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:
# Fine Turning the Hyper parameters of BERT Model
args = TrainingArguments(output_dir="output",
                            evaluation_strategy="epoch",
                            metric_for_best_model="f1",
                            save_strategy="epoch",
                            num_train_epochs=3,
                            load_best_model_at_end=True
                            )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [23]:
# Load the Model with parameters
trainer = Trainer(args=args,
                    model=model,
                    train_dataset=train_dataset,
                    eval_dataset=valid_dataset,
                    compute_metrics=compute_metrics,
                    callbacks=[EarlyStoppingCallback(
                            early_stopping_patience=3)]
                    )

In [24]:
# Training the Model
trainer.train()

***** Running training *****
  Num examples = 30255
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11346
  import sys


Epoch,Training Loss,Validation Loss,F1
1,0.2026,0.168397,0.968098
2,0.0857,0.1605,0.974793
3,0.0645,0.13276,0.977637


***** Running Evaluation *****
  Num examples = 10085
  Batch size = 8
Saving model checkpoint to output/checkpoint-3782
Configuration saved in output/checkpoint-3782/config.json
Model weights saved in output/checkpoint-3782/pytorch_model.bin
  import sys
***** Running Evaluation *****
  Num examples = 10085
  Batch size = 8
Saving model checkpoint to output/checkpoint-7564
Configuration saved in output/checkpoint-7564/config.json
Model weights saved in output/checkpoint-7564/pytorch_model.bin
  import sys
***** Running Evaluation *****
  Num examples = 10085
  Batch size = 8
Saving model checkpoint to output/checkpoint-11346
Configuration saved in output/checkpoint-11346/config.json
Model weights saved in output/checkpoint-11346/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from output/checkpoint-11346 (score: 0.9776373669228218).


TrainOutput(global_step=11346, training_loss=0.1363149576037518, metrics={'train_runtime': 4952.9548, 'train_samples_per_second': 18.325, 'train_steps_per_second': 2.291, 'total_flos': 1.194085189020672e+16, 'train_loss': 0.1363149576037518, 'epoch': 3.0})

In [25]:
# Predictions on Test Dataset
trainer = Trainer(model=model)
predictions = trainer.predict(test_dataset)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running Prediction *****
  Num examples = 10085
  Batch size = 8
  import sys


In [26]:
# Classification Report 
preds = np.argmax(predictions.predictions, axis=1).flatten()
true_vals = predictions.label_ids

print(classification_report(true_vals, preds, target_names=list(le.classes_)))

                        precision    recall  f1-score   support

                 Books       0.98      0.97      0.98      2335
Clothing & Accessories       0.99      0.99      0.99      1772
           Electronics       0.97      0.97      0.97      2111
             Household       0.98      0.98      0.98      3867

              accuracy                           0.98     10085
             macro avg       0.98      0.98      0.98     10085
          weighted avg       0.98      0.98      0.98     10085

