In [1]:
!pip install datasets transformers --quiet

In [2]:
import pandas as pd

## load data into data-frame
data = pd.read_csv('./demo_bot_intents.csv')

In [3]:
from datasets import Dataset

def conv_to_ds(_data, _is_train):
  filtered_df = _data[_data['is_train']==int(_is_train)][['text', 'lables']]
  filtered_df = filtered_df.rename(columns={"lables": "labels"})
  return Dataset.from_pandas(filtered_df).shuffle(seed=42)

## creation of data set can also be done directly using seperate data file, for ex.
## train_dataset = Dataset.from_pandas(train_data)
## validation_dataset = Dataset.from_pandas(validation_data)
train_dataset  = conv_to_ds(data, _is_train=True)
validation_dataset = conv_to_ds(data, _is_train=False)

In [4]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

## choose model
model_name = "roberta-base"

## load tockenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

## configure model
# config = AutoConfig.from_pretrained(
#         model_name,
#         num_labels=5
#         )

## load model
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=5)

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

## tokenize dataset - 
## create base units of sentences - words, part-of-words, seperators, etc 
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
import numpy as np
from datasets import load_metric

## optional - define evaluation metric 
metric = load_metric("accuracy")

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
from transformers import TrainingArguments, Trainer

## configure training arguments
training_args = TrainingArguments(output_dir="./test_trainer", evaluation_strategy="epoch", num_train_epochs=10,learning_rate=5e-05)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 50


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.589199,0.2
2,No log,1.515097,0.525
3,No log,1.250162,0.575
4,No log,0.966286,0.7
5,No log,0.743819,0.75
6,No log,0.564199,0.85
7,No log,0.443852,0.875
8,No log,0.394411,0.875
9,No log,0.368025,0.875
10,No log,0.35728,0.875


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forwa

TrainOutput(global_step=50, training_loss=0.8202389526367188, metrics={'train_runtime': 48.9374, 'train_samples_per_second': 8.174, 'train_steps_per_second': 1.022, 'total_flos': 105247256985600.0, 'train_loss': 0.8202389526367188, 'epoch': 10.0})

In [9]:
trainer.save_model("./inferenced_model")

Saving model checkpoint to ./inferenced_model
Configuration saved in ./inferenced_model/config.json
Model weights saved in ./inferenced_model/pytorch_model.bin


In [10]:
tokenizer.save_pretrained("./inferenced_model")

tokenizer config file saved in ./inferenced_model/tokenizer_config.json
Special tokens file saved in ./inferenced_model/special_tokens_map.json


('./inferenced_model/tokenizer_config.json',
 './inferenced_model/special_tokens_map.json',
 './inferenced_model/vocab.json',
 './inferenced_model/merges.txt',
 './inferenced_model/added_tokens.json',
 './inferenced_model/tokenizer.json')

In [12]:
model_2 = AutoModelForSequenceClassification.from_pretrained("./inferenced_model",num_labels=5)

loading configuration file ./inferenced_model/config.json
Model config RobertaConfig {
  "_name_or_path": "./inferenced_model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.2",
  "type_vocab

In [13]:
tokenizer_2 = AutoTokenizer.from_pretrained("./inferenced_model")

Didn't find file ./inferenced_model/added_tokens.json. We won't load it.
loading file ./inferenced_model/vocab.json
loading file ./inferenced_model/merges.txt
loading file ./inferenced_model/tokenizer.json
loading file None
loading file ./inferenced_model/special_tokens_map.json
loading file ./inferenced_model/tokenizer_config.json


In [50]:
model_2.eval()
text = "i want to place an order, please"
logit = tokenizer_2(text, truncation=True, return_tensors="pt")
model_2(logit.input_ids, attention_mask=logit.attention_mask)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.6924,  1.6424,  1.3974, -0.7783, -1.1298]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)