In [1]:
# Transformers installation
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
Collectin

## Loading Data

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd

df_train= pd.read_csv('/content/train.csv')
df_dev= pd.read_csv('/content/dev.csv')

df_train.rename(columns= {'initial_request': 'text','clarification_need': 'label'}, inplace= True)
df_dev.rename(columns= {'initial_request': 'text','clarification_need': 'label'}, inplace= True)

df_train['label']= [df_train['label'][i]-1 for i in range(len(df_train['label']))]  #since model uses labels 0-3
df_dev['label']= [df_dev['label'][i]-1 for i in range(len(df_dev['label']))]


train_data = Dataset.from_pandas(df_train) #.select(range(184))
dev_data = Dataset.from_pandas(df_dev) #.select(range(48))

In [3]:
train_data[0]

{'text': 'Tell me about Obama family tree.', 'label': 1}

## Tokenizing Data

In [4]:
from transformers import AutoTokenizer, OpenAIGPTForSequenceClassification

from transformers import AutoModelForSequenceClassification, AutoModel
# from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import AutoTokenizer, OPTForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('ArthurZ/opt-350m-dummy-sc', use_fast=False, model_max_length= 256)
model = OPTForSequenceClassification.from_pretrained('ArthurZ/opt-350m-dummy-sc', num_labels=4, ignore_mismatched_sizes=True)

# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))

# model.config.pad_token_id = tokenizer.pad_token_id

# def tokenize_function(examples):

#     return tokenizer(examples["text"], padding="max_length", truncation=True) #padding="max_length", add this argument if needed

def tokenize_function(examples):

    tnizer = tokenizer(examples["text"], padding="max_length", truncation=True)
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # model.resize_token_embeddings(len(tokenizer))
    return tnizer #padding="max_length", add this argument if needed

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_dev = dev_data.map(tokenize_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at ArthurZ/opt-350m-dummy-sc and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([2, 512]) in the checkpoint and torch.Size([4, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [5]:
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_train.set_format("torch")

tokenized_dev = tokenized_dev.remove_columns(["text"])
tokenized_dev = tokenized_dev.rename_column("label", "labels")
tokenized_dev.set_format("torch")

### Dataloader

In [6]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_train, batch_size=8)
eval_dataloader = DataLoader(tokenized_dev, batch_size=8)

### Model

In [7]:
import torch

In [8]:
# from transformers import AutoModelForSequenceClassification, AutoModel

# model = AutoModelForSequenceClassification.from_pretrained("t5", num_labels=4)

In [9]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [10]:
from transformers import get_scheduler

num_epochs = 40
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [11]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
model.to(device)

OPTForSequenceClassification(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_featur

### Training

In [12]:
import os
from datasets import load_metric
from tqdm.auto import tqdm

In [13]:
def train_one_batch(model, batch, optimizer, lr_scheduler):
  batch = {k: v.to(device) for k, v in batch.items()}
  outputs = model(**batch)
  loss = outputs.loss
  loss.backward()

  optimizer.step()
  lr_scheduler.step()
  optimizer.zero_grad()

  return loss, model, optimizer, lr_scheduler


def train_one_epoch(model, train_dataloader, optimizer, lr_scheduler):
  model.train()
  loss = 0
  progress_bar = tqdm(range(len(train_dataloader)))

  for batch in train_dataloader:
    running_loss, model, optimizer, lr_scheduler = train_one_batch(model, batch, optimizer, lr_scheduler)
    loss += running_loss
    progress_bar.update(1)

  loss = loss / len(train_dataloader)
  return loss, model, optimizer, lr_scheduler
  

def get_val_loss(model, val_dataloader):
  model.eval()
  loss = 0

  metric = load_metric('f1')
  with torch.no_grad():
    for batch in val_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      running_loss = outputs.loss
      loss += running_loss

      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)
      metric.add_batch(predictions=predictions, references=batch["labels"])
  
  f1_score = metric.compute(average= 'weighted')
  
  return loss / len(val_dataloader), f1_score

In [14]:
# num_epochs=10
for epoch in range(num_epochs):
  print(f'Epoch: [{epoch+1} / {num_epochs}]:')
  t_loss, model, optimizer, lr_scheduler = train_one_epoch(model, train_dataloader, optimizer, lr_scheduler)
  v_loss, f1_score = get_val_loss(model, eval_dataloader)
  # print
  print(f"\tLoss -> Train: {t_loss:.5f} | Val: {v_loss} | F1 Score: {f1_score['f1']}")
  # save
  # DST Folder
  DST = '/content/checkpoints'
  if not os.path.isdir(DST):
    os.makedirs(DST)
  path = os.path.join(DST, f'epoch_{epoch+1}.pth')
  torch.save(model.state_dict(), path)

Epoch: [1 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

  metric = load_metric('f1')


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

	Loss -> Train: 2.08649 | Val: 1.414610505104065 | F1 Score: 0.38066666666666665
Epoch: [2 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 1.05459 | Val: 1.5928634405136108 | F1 Score: 0.3277820076137047
Epoch: [3 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.40131 | Val: 2.339188814163208 | F1 Score: 0.4342089411818698
Epoch: [4 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.17033 | Val: 2.6689932346343994 | F1 Score: 0.26413071895424833
Epoch: [5 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.05616 | Val: 3.0574655532836914 | F1 Score: 0.40800375234521574
Epoch: [6 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.11052 | Val: 3.9219017028808594 | F1 Score: 0.4413186813186813
Epoch: [7 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.10911 | Val: 3.9862704277038574 | F1 Score: 0.3673846153846154
Epoch: [8 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.09759 | Val: 4.334994792938232 | F1 Score: 0.2293295019157088
Epoch: [9 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.05712 | Val: 2.4362359046936035 | F1 Score: 0.3259203491543917
Epoch: [10 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.07834 | Val: 4.1941142082214355 | F1 Score: 0.3
Epoch: [11 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.04594 | Val: 3.327260732650757 | F1 Score: 0.3296727272727273
Epoch: [12 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.01543 | Val: 3.138148784637451 | F1 Score: 0.4390769230769231
Epoch: [13 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00085 | Val: 2.6724655628204346 | F1 Score: 0.48479999999999995
Epoch: [14 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00016 | Val: 2.6868295669555664 | F1 Score: 0.46768831168831176
Epoch: [15 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00019 | Val: 2.7196192741394043 | F1 Score: 0.48479999999999995
Epoch: [16 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00010 | Val: 2.735668420791626 | F1 Score: 0.48479999999999995
Epoch: [17 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00010 | Val: 2.7471139430999756 | F1 Score: 0.48479999999999995
Epoch: [18 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00010 | Val: 2.7630999088287354 | F1 Score: 0.48479999999999995
Epoch: [19 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00008 | Val: 2.7723495960235596 | F1 Score: 0.48479999999999995
Epoch: [20 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00010 | Val: 2.782440662384033 | F1 Score: 0.48479999999999995
Epoch: [21 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00007 | Val: 2.790336847305298 | F1 Score: 0.48479999999999995
Epoch: [22 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00006 | Val: 2.798110246658325 | F1 Score: 0.48479999999999995
Epoch: [23 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00006 | Val: 2.8037595748901367 | F1 Score: 0.48479999999999995
Epoch: [24 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00007 | Val: 2.815122365951538 | F1 Score: 0.48479999999999995
Epoch: [25 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00006 | Val: 2.8228394985198975 | F1 Score: 0.46768831168831176
Epoch: [26 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00006 | Val: 2.82818865776062 | F1 Score: 0.46768831168831176
Epoch: [27 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00006 | Val: 2.8356130123138428 | F1 Score: 0.46768831168831176
Epoch: [28 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00006 | Val: 2.8427987098693848 | F1 Score: 0.46768831168831176
Epoch: [29 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00005 | Val: 2.8491199016571045 | F1 Score: 0.46768831168831176
Epoch: [30 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00005 | Val: 2.853804111480713 | F1 Score: 0.46768831168831176
Epoch: [31 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.8569445610046387 | F1 Score: 0.46768831168831176
Epoch: [32 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.861023187637329 | F1 Score: 0.46768831168831176
Epoch: [33 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.8650083541870117 | F1 Score: 0.46768831168831176
Epoch: [34 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00005 | Val: 2.8691697120666504 | F1 Score: 0.46768831168831176
Epoch: [35 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.871406316757202 | F1 Score: 0.46768831168831176
Epoch: [36 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00005 | Val: 2.8731188774108887 | F1 Score: 0.46768831168831176
Epoch: [37 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.874603509902954 | F1 Score: 0.46768831168831176
Epoch: [38 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.8753912448883057 | F1 Score: 0.46768831168831176
Epoch: [39 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.8755805492401123 | F1 Score: 0.46768831168831176
Epoch: [40 / 40]:


  0%|          | 0/24 [00:00<?, ?it/s]

	Loss -> Train: 0.00004 | Val: 2.875666618347168 | F1 Score: 0.46768831168831176


### Testing

In [15]:
df_test= pd.read_csv('/content/test_with_labels.csv')

df_test.rename(columns= {'initial_request': 'text','clarification_need': 'label'}, inplace= True)
df_test['label']= [df_test['label'][i]-1 for i in range(len(df_test['label']))]
test_data = Dataset.from_pandas(df_test) #.select(range(184))
tokenized_test = test_data.map(tokenize_function, batched=True)

tokenized_test = tokenized_test.remove_columns(["text"])
tokenized_test = tokenized_test.rename_column("label", "labels")
tokenized_test.set_format("torch")
test_dataloader = DataLoader(tokenized_test, batch_size=8)

# model = TheModelClass(*args, **kwargs)
# modelnew = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=4)
# modelnew.to(device)
# modelnew.load_state_dict(torch.load('/content/checkpoints/epoch_20.pth'))
# modelnew.eval()
# modelnew= torch.load('/content/checkpoints/epoch_8.pth',weights_only= True)

test_loss, f1_score = get_val_loss(model, test_dataloader)

print(f"Test loss is: {test_loss:.5f}, F1 score is: {f1_score['f1']:.5f}")

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Test loss is: 2.67111, F1 score is: 0.43480


1
1
1


KeyboardInterrupt: ignored