In [1]:
!pip install datasets transformers --quiet

[K     |████████████████████████████████| 365 kB 7.2 MB/s 
[K     |████████████████████████████████| 4.7 MB 29.1 MB/s 
[K     |████████████████████████████████| 212 kB 45.1 MB/s 
[K     |████████████████████████████████| 115 kB 49.8 MB/s 
[K     |████████████████████████████████| 120 kB 34.3 MB/s 
[K     |████████████████████████████████| 127 kB 3.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 44.9 MB/s 
[?25h

In [2]:
import pandas as pd

## load data into data-frame
data = pd.read_csv('./demo_bot_intents.csv')

In [3]:
from datasets import Dataset

def conv_to_ds(_data, _is_train):
  filtered_df = _data[_data['is_train']==int(_is_train)][['text', 'lables']]
  filtered_df = filtered_df.rename(columns={"lables": "labels"})
  return Dataset.from_pandas(filtered_df)

## creation of data set can also be done directly using seperate data file, for ex.
## train_dataset = Dataset.from_pandas(train_data)
## validation_dataset = Dataset.from_pandas(validation_data)
train_dataset  = conv_to_ds(data, _is_train=True)
validation_dataset = conv_to_ds(data, _is_train=False)

In [4]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

## choose model
model_name = "roberta-base"

## load tockenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

## configure model
config = AutoConfig.from_pretrained(
        model_name,
        num_labels=5
        )

## load model
model = AutoModelForSequenceClassification.from_config(config)

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

## tokenize dataset - 
## create base units of sentences - words, part-of-words, seperators, etc 
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
import numpy as np
from datasets import load_metric

## optional - define evaluation metric 
metric = load_metric("accuracy")

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
from transformers import TrainingArguments, Trainer

## configure training arguments
training_args = TrainingArguments(output_dir="./test_trainer", evaluation_strategy="epoch", num_train_epochs=30,learning_rate=5e-05)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 150


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.793029,0.2
2,No log,1.65127,0.2
3,No log,1.651574,0.2
4,No log,1.625083,0.2
5,No log,1.55063,0.225
6,No log,1.513872,0.25
7,No log,1.686228,0.55
8,No log,1.07086,0.6
9,No log,1.420025,0.525
10,No log,0.908911,0.625


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forwa

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.793029,0.2
2,No log,1.65127,0.2
3,No log,1.651574,0.2
4,No log,1.625083,0.2
5,No log,1.55063,0.225
6,No log,1.513872,0.25
7,No log,1.686228,0.55
8,No log,1.07086,0.6
9,No log,1.420025,0.525
10,No log,0.908911,0.625


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSequenceClassification.forwa

TrainOutput(global_step=150, training_loss=0.4312566630045573, metrics={'train_runtime': 8869.3626, 'train_samples_per_second': 0.135, 'train_steps_per_second': 0.017, 'total_flos': 315741770956800.0, 'train_loss': 0.4312566630045573, 'epoch': 30.0})

In [9]:
trainer.save_model("./trained_model")

Saving model checkpoint to ./trained_model
Configuration saved in ./trained_model/config.json
Model weights saved in ./trained_model/pytorch_model.bin


In [11]:
trainer.save_state()

In [12]:
tokenizer.save_pretrained("./trained_model")

tokenizer config file saved in ./trained_model/tokenizer_config.json
Special tokens file saved in ./trained_model/special_tokens_map.json


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.json',
 './trained_model/merges.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [34]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

## load the post-trained model
trained_model_name = "./trained_model"
trained_tokenizer = AutoTokenizer.from_pretrained(trained_model_name)

trained_config = AutoConfig.from_pretrained(
        trained_model_name,
        num_labels=5
        )
trained_model = AutoModelForSequenceClassification.from_config(trained_config)

Didn't find file ./trained_model/added_tokens.json. We won't load it.
loading file ./trained_model/vocab.json
loading file ./trained_model/merges.txt
loading file ./trained_model/tokenizer.json
loading file None
loading file ./trained_model/special_tokens_map.json
loading file ./trained_model/tokenizer_config.json
loading configuration file ./trained_model/config.json
Model config RobertaConfig {
  "_name_or_path": "./trained_model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e

In [None]:
import numpy as np

trained_model.eval()
text = "i want to order"
logit = trained_tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
trained_model(logit.input_ids, attention_mask=logit.attention_mask)
## np.argmax(logit, axis=-1)