In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Attach Google Drive to the Colab VM; the reviews CSV is read from
# ./drive/MyDrive further down.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [4]:
# Sanity check: list the current working directory; the 'drive' entry should
# appear here once the mount above has succeeded.
import os
os.listdir('./')

['.config', 'drive', 'sample_data']

In [5]:
# Build a small binary-sentiment dataset (columns: label, text) from the
# hotel-reviews CSV stored on Drive.
SAMPLE_SIZE = 100  # number of reviews kept for this quick demo
SAMPLE_SEED = 42   # fixed seed so Restart & Run All draws the same rows

# Keep the raw frame untouched so this cell can be re-run safely.
df_reviews_raw = pd.read_csv('./drive/MyDrive/hf/Datafiniti_Hotel_Reviews_Jun19.csv')

df_dataset = (
    df_reviews_raw
    # Model input is "<title> <body>"; if either part is missing the
    # concatenation is NaN and the row is dropped in the next step.
    .assign(text=df_reviews_raw['reviews.title'] + ' ' + df_reviews_raw['reviews.text'])
    .dropna(subset=['text'])
    .assign(
        # Collapse every run of whitespace (spaces, tabs, newlines) to a single
        # space. The previous split(' ')/join(' ') round trip left the text
        # unchanged.
        text=lambda frame: frame['text'].str.split().str.join(' '),
        # Positive class (1) only for ratings strictly above 4; everything
        # else, including a missing rating, becomes 0.
        label=lambda frame: np.where(frame['reviews.rating'] > 4, 1, 0),
    )
    .loc[:, ['label', 'text']]
    .sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
)
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 4265 to 1761
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   100 non-null    int64 
 1   text    100 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.3+ KB


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled reviews for evaluation; the fixed random_state
# makes the split repeatable.
df_train, df_test = train_test_split(
    df_dataset,
    test_size=0.2,
    random_state=42,
)
tuple(frame.shape for frame in (df_train, df_test))

((80, 2), (20, 2))

In [7]:
# AdamW is taken from PyTorch: the copy shipped with transformers is deprecated
# in favour of torch.optim.AdamW and may be missing from the (unpinned) release
# installed above. NOTE: the two implementations' defaults differ slightly
# (e.g. weight decay); pass them explicitly if exact parity matters.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler, AutoTokenizer

# Hub id of the pretrained checkpoint. Kept under its own name so `model`
# only ever refers to the network itself.
checkpoint = "bert-base-cased"#"prajjwal1/bert-tiny"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 matches the 0/1 label built during data preparation.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [8]:
from datasets import Dataset, DatasetDict

# Convert both pandas splits to Hugging Face datasets. The index is reset and
# dropped so it is not carried over as an extra column.
train = Dataset.from_pandas(df_train.reset_index(drop=True))
validation = Dataset.from_pandas(df_test.reset_index(drop=True))

# The held-out split is stored under the 'test' key.
ds = DatasetDict({'train': train, 'test': validation})

In [9]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length.
    Relies on the notebook-level `tokenizer`.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Tokenize in batches, drop the raw text column, then rename the target column
# to "labels" before the batches are fed to the model.
tokenized_datasets = (
    ds.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
# Have the datasets return PyTorch tensors when indexed.
tokenized_datasets.set_format(type="torch")

In [11]:
from torch.utils.data import DataLoader

# Training batches are shuffled; evaluation keeps the dataset order and uses a
# larger batch size.
train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=8,
    shuffle=True,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets['test'],
    batch_size=32,
)

### Additions for Distributed Training using HF's Accelerate library

In [12]:
# Create the Accelerator with its default settings; it is used below to
# prepare the dataloaders, model and optimizer and to run the backward pass.
from accelerate import Accelerator
accelerator = Accelerator()

In [13]:
# Optimizer over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=1e-5)

# Hand every training object to Accelerate. The names are rebound to the
# prepared versions, so re-running this cell wraps them a second time.
(
    train_dataloader,
    eval_dataloader,
    model,
    optimizer,
) = accelerator.prepare(train_dataloader, eval_dataloader, model, optimizer)



In [14]:
from tqdm import tqdm 

num_epochs = 20
# One scheduler step per optimizer step, computed from the prepared
# dataloader's length.
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate over all training steps, no warm-up.
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps
)

model.train()
# The bar is a context manager so it is closed even if a step raises, and it
# is drawn only by the local main process so a multi-process launch does not
# print one bar per process.
with tqdm(
    range(num_training_steps),
    disable=not accelerator.is_local_main_process,
) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            # Backward pass goes through Accelerate instead of loss.backward().
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

100%|██████████| 200/200 [02:24<00:00,  1.39it/s]

In [15]:
import evaluate
import torch

# Accuracy on the held-out split.
metric = evaluate.load("accuracy")
model.eval()

for batch in eval_dataloader:
  # Inference only: no gradients needed.
  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits
  predictions = torch.argmax(logits, dim=-1)
  # Collect predictions and labels from every process before scoring. In a
  # multi-process launch each process only sees its own shard of the data; in
  # a single process this is a pass-through.
  predictions, references = accelerator.gather_for_metrics(
      (predictions, batch["labels"])
  )
  metric.add_batch(predictions=predictions, references=references)

metric.compute()

{'accuracy': 0.7}

In [17]:
# Confusion matrix on the held-out split, using a community metric loaded from
# the Hub. This repeats the forward pass over the evaluation data.
metric = evaluate.load("BucketHeadP65/confusion_matrix")
model.eval()

for batch in eval_dataloader:
  # Inference only: no gradients needed.
  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits
  predictions = torch.argmax(logits, dim=-1)
  # Collect predictions and labels from every process before scoring. In a
  # multi-process launch each process only sees its own shard of the data; in
  # a single process this is a pass-through.
  predictions, references = accelerator.gather_for_metrics(
      (predictions, batch["labels"])
  )
  metric.add_batch(predictions=predictions, references=references)

metric.compute()

{'confusion_matrix': array([[7, 4],
        [2, 7]])}