In [1]:
# Transformers installation
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collectin

## Loading Data

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd

df_train= pd.read_csv('/content/train.csv')
df_dev= pd.read_csv('/content/dev.csv')

df_train.rename(columns= {'initial_request': 'text','clarification_need': 'label'}, inplace= True)
df_dev.rename(columns= {'initial_request': 'text','clarification_need': 'label'}, inplace= True)

df_train['label']= [df_train['label'][i]-1 for i in range(len(df_train['label']))]  #since model uses labels 0-3
df_dev['label']= [df_dev['label'][i]-1 for i in range(len(df_dev['label']))]


train_data = Dataset.from_pandas(df_train) #.select(range(184))
dev_data = Dataset.from_pandas(df_dev) #.select(range(48))

In [3]:
train_data[0]

{'text': 'Tell me about Obama family tree.', 'label': 1}

## Tokenizing Data

In [4]:
from transformers import AutoTokenizer, OpenAIGPTForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("gpt2")
from transformers import AutoModelForSequenceClassification, AutoModel

model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=4)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

model.config.pad_token_id = tokenizer.pad_token_id
# def tokenize_function(examples):

#     return tokenizer(examples["text"], padding="max_length", truncation=True) #padding="max_length", add this argument if needed

def tokenize_function(examples):

    tnizer = tokenizer(examples["text"], padding="max_length", truncation=True)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # model.resize_token_embeddings(len(tokenizer))
    return tnizer #padding="max_length", add this argument if needed

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_dev = dev_data.map(tokenize_function, batched=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.


Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [5]:
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_train.set_format("torch")

tokenized_dev = tokenized_dev.remove_columns(["text"])
tokenized_dev = tokenized_dev.rename_column("label", "labels")
tokenized_dev.set_format("torch")

### Dataloader

In [6]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_train, batch_size=8)
eval_dataloader = DataLoader(tokenized_dev, batch_size=8)

### Model

In [7]:
import torch

In [None]:
# from transformers import AutoModelForSequenceClassification, AutoModel

# model = AutoModelForSequenceClassification.from_pretrained("t5", num_labels=4)

In [8]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [9]:
from transformers import get_scheduler

num_epochs = 50
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [10]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

### Training

In [11]:
import os
from datasets import load_metric
from tqdm.auto import tqdm

In [12]:
def train_one_batch(model, batch, optimizer, lr_scheduler):
  batch = {k: v.to(device) for k, v in batch.items()}
  outputs = model(**batch)
  loss = outputs.loss
  loss.backward()

  optimizer.step()
  lr_scheduler.step()
  optimizer.zero_grad()

  return loss, model, optimizer, lr_scheduler


def train_one_epoch(model, train_dataloader, optimizer, lr_scheduler):
  model.train()
  loss = 0
  progress_bar = tqdm(range(len(train_dataloader)))

  for batch in train_dataloader:
    running_loss, model, optimizer, lr_scheduler = train_one_batch(model, batch, optimizer, lr_scheduler)
    loss += running_loss
    progress_bar.update(1)

  loss = loss / len(train_dataloader)
  return loss, model, optimizer, lr_scheduler
  

def get_val_loss(model, val_dataloader):
  model.eval()
  loss = 0

  metric = load_metric('f1')
  with torch.no_grad():
    for batch in val_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      running_loss = outputs.loss
      loss += running_loss

      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)
      metric.add_batch(predictions=predictions, references=batch["labels"])
  
  f1_score = metric.compute(average= 'weighted')
  
  return loss / len(val_dataloader), f1_score

In [13]:
# num_epochs=10
for epoch in range(num_epochs):
  print(f'Epoch: [{epoch+1} / {num_epochs}]:')
  t_loss, model, optimizer, lr_scheduler = train_one_epoch(model, train_dataloader, optimizer, lr_scheduler)
  v_loss, f1_score = get_val_loss(model, eval_dataloader)
  # print
  print(f"\tLoss -> Train: {t_loss:.5f} | Val: {v_loss} | F1 Score: {f1_score['f1']}")
  # save
  # DST Folder
  DST = '/content/checkpoints'
  if not os.path.isdir(DST):
    os.makedirs(DST)
  path = os.path.join(DST, f'epoch_{epoch+1}.pth')
  torch.save(model.state_dict(), path)

Epoch: [1 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

  metric = load_metric('f1')


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

	Loss -> Train: 1.67422 | Val: 1.1391878128051758 | F1 Score: 0.2836122448979592
Epoch: [2 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 1.16168 | Val: 1.114803671836853 | F1 Score: 0.3720779220779221
Epoch: [3 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 1.05195 | Val: 1.1132296323776245 | F1 Score: 0.2953535353535353
Epoch: [4 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.78610 | Val: 1.0121967792510986 | F1 Score: 0.3624504504504504
Epoch: [5 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.63787 | Val: 1.0433708429336548 | F1 Score: 0.4088888888888889
Epoch: [6 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.48402 | Val: 1.066727876663208 | F1 Score: 0.47395348837209306
Epoch: [7 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.30352 | Val: 1.1803343296051025 | F1 Score: 0.4043243243243243
Epoch: [8 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.21273 | Val: 1.439252495765686 | F1 Score: 0.3460219780219781
Epoch: [9 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.11395 | Val: 1.559278130531311 | F1 Score: 0.38352286636252364
Epoch: [10 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.06041 | Val: 1.4714782238006592 | F1 Score: 0.3366966966966967
Epoch: [11 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.07635 | Val: 1.509538173675537 | F1 Score: 0.3968097165991903
Epoch: [12 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.25253 | Val: 1.2918964624404907 | F1 Score: 0.41778266178266177
Epoch: [13 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.06485 | Val: 1.4142400026321411 | F1 Score: 0.4336190476190476
Epoch: [14 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.04568 | Val: 1.5749125480651855 | F1 Score: 0.4008780487804878
Epoch: [15 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01931 | Val: 1.5079995393753052 | F1 Score: 0.38799084668192224
Epoch: [16 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01657 | Val: 1.5182629823684692 | F1 Score: 0.4142489557226399
Epoch: [17 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01375 | Val: 1.577722430229187 | F1 Score: 0.41270329670329664
Epoch: [18 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01424 | Val: 1.6254290342330933 | F1 Score: 0.41356708407871196
Epoch: [19 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00970 | Val: 1.6348750591278076 | F1 Score: 0.41356708407871196
Epoch: [20 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01433 | Val: 1.588599443435669 | F1 Score: 0.44955347091932457
Epoch: [21 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01453 | Val: 1.6063802242279053 | F1 Score: 0.44955347091932457
Epoch: [22 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01086 | Val: 1.6368052959442139 | F1 Score: 0.43300000000000005
Epoch: [23 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00972 | Val: 1.6310983896255493 | F1 Score: 0.44955347091932457
Epoch: [24 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00793 | Val: 1.6714318990707397 | F1 Score: 0.4128421052631579
Epoch: [25 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00630 | Val: 1.6777993440628052 | F1 Score: 0.42846115288220554
Epoch: [26 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00525 | Val: 1.6808087825775146 | F1 Score: 0.42846115288220554
Epoch: [27 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00514 | Val: 1.7012050151824951 | F1 Score: 0.42846115288220554
Epoch: [28 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01123 | Val: 1.7508207559585571 | F1 Score: 0.433567084078712
Epoch: [29 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01180 | Val: 1.7524312734603882 | F1 Score: 0.411578073089701
Epoch: [30 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01137 | Val: 1.7451248168945312 | F1 Score: 0.433567084078712
Epoch: [31 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00545 | Val: 1.729997992515564 | F1 Score: 0.43302075702075704
Epoch: [32 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00493 | Val: 1.7170288562774658 | F1 Score: 0.44955347091932457
Epoch: [33 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01038 | Val: 1.7132283449172974 | F1 Score: 0.44955347091932457
Epoch: [34 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01102 | Val: 1.712790608406067 | F1 Score: 0.44955347091932457
Epoch: [35 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01071 | Val: 1.7278839349746704 | F1 Score: 0.44955347091932457
Epoch: [36 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00403 | Val: 1.755935788154602 | F1 Score: 0.44955347091932457
Epoch: [37 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00403 | Val: 1.7796140909194946 | F1 Score: 0.43302075702075704
Epoch: [38 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00310 | Val: 1.789417028427124 | F1 Score: 0.43302075702075704
Epoch: [39 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00262 | Val: 1.7962619066238403 | F1 Score: 0.43302075702075704
Epoch: [40 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00971 | Val: 1.800430417060852 | F1 Score: 0.43302075702075704
Epoch: [41 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00618 | Val: 1.789123296737671 | F1 Score: 0.44955347091932457
Epoch: [42 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00329 | Val: 1.722124695777893 | F1 Score: 0.40662623599208964
Epoch: [43 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00294 | Val: 1.7217991352081299 | F1 Score: 0.40662623599208964
Epoch: [44 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00325 | Val: 1.729636549949646 | F1 Score: 0.40662623599208964
Epoch: [45 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00269 | Val: 1.7353461980819702 | F1 Score: 0.40662623599208964
Epoch: [46 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00259 | Val: 1.740735411643982 | F1 Score: 0.40662623599208964
Epoch: [47 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00311 | Val: 1.7424834966659546 | F1 Score: 0.40662623599208964
Epoch: [48 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00279 | Val: 1.7450053691864014 | F1 Score: 0.40662623599208964
Epoch: [49 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00301 | Val: 1.7450342178344727 | F1 Score: 0.40662623599208964
Epoch: [50 / 50]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00335 | Val: 1.744935393333435 | F1 Score: 0.40662623599208964


### Testing

In [14]:
df_test= pd.read_csv('/content/test_with_labels.csv')

df_test.rename(columns= {'initial_request': 'text','clarification_need': 'label'}, inplace= True)
df_test['label']= [df_test['label'][i]-1 for i in range(len(df_test['label']))]
test_data = Dataset.from_pandas(df_test) #.select(range(184))
tokenized_test = test_data.map(tokenize_function, batched=True)

tokenized_test = tokenized_test.remove_columns(["text"])
tokenized_test = tokenized_test.rename_column("label", "labels")
tokenized_test.set_format("torch")
test_dataloader = DataLoader(tokenized_test, batch_size=8)

# model = TheModelClass(*args, **kwargs)
# modelnew = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
# modelnew.to(device)
# modelnew.load_state_dict(torch.load('/content/checkpoints/epoch_8.pth'))
# modelnew.eval()
# modelnew= torch.load('/content/checkpoints/epoch_8.pth',weights_only= True)

test_loss, f1_score = get_val_loss(model, test_dataloader)

print(f"Test loss is: {test_loss:.5f}, F1 score is: {f1_score['f1']:.5f}")

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Test loss is: 2.00375, F1 score is: 0.50007
