**References**:

https://towardsdatascience.com/how-to-fine-tune-an-nlp-regression-model-with-transformers-and-huggingface-94b2ed6f798f

https://predictivehacks.com/how-to-fine-tune-an-nlp-regression-model-with-transformers-and-huggingface/


# Linear Regression 

In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from transformers.utils import logging
logging.set_verbosity_error()

## Create AutoTokenizer

In [137]:
# Load the tokenizer published with the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

## Pandas To Dataset

In [138]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler


columns = ['text','wip','activity_nlp','resource_nlp','case_nlp']

def make_dataset(dataset, iloc_from, iloc_to):
    """Slice rows out of ``dataset`` and prepare them for regression training.

    The module-level ``columns`` are selected, rows ``iloc_from:iloc_to``
    (positional, end-exclusive) are kept, the ``wip`` column is min-max
    scaled using only the kept rows, and ``wip`` is renamed to ``labels``.

    Args:
        dataset: DataFrame containing every name listed in ``columns``.
        iloc_from: First positional row index to keep.
        iloc_to: Positional row index to stop at (exclusive).

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. ``dataset`` itself is
        not modified.
    """
    # Explicit copy: assigning into a bare slice triggers pandas'
    # SettingWithCopyWarning and leaves it unclear which object is written.
    df_ = dataset[columns].iloc[iloc_from:iloc_to].copy()

    # MinMaxScaler refuses to fit on zero rows, so an empty slice is passed
    # through unscaled instead of raising.
    if not df_.empty:
        df_[['wip']] = MinMaxScaler().fit_transform(df_[['wip']])

    df_ = df_.rename(columns={"wip": "labels"})
    return df_.reset_index(drop=True)

# Read only the columns listed in `columns`; 'wip' is parsed as float because
# it becomes the regression label.
df = pd.read_csv('df_nlp_real.csv',usecols=columns,dtype={'wip':'float'})
# Keep positional rows 200..299 and turn 'wip' into a scaled 'labels' column.
df_nlp = make_dataset(df, 200,300)
display(df_nlp.head(3))

# Only the text and its label go into the Hugging Face dataset; 30% of the
# rows are held out as the test split.
# NOTE(review): make_dataset scales 'wip' before this split, so the scaling
# also sees the rows that end up in the test split.
# NOTE(review): no seed is passed to train_test_split, so the split presumably
# differs between runs - confirm whether reproducibility matters here.
dataset = Dataset.from_pandas(df_nlp[['text','labels']],preserve_index=False) 
dataset = dataset.train_test_split(test_size=0.3) 



def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``text`` field of ``examples``.

    Truncation is enabled and ``padding="max_length"`` is requested; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to every split, passing the rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Bare expression: the notebook renders the DatasetDict summary as cell output.
dataset

Unnamed: 0,text,labels,activity_nlp,resource_nlp,case_nlp
0,R2 starts A3 on C1030 at 2010-02-22 12:05,0.8,A3,R2,C1030
1,R2 starts A1 on C1328 at 2010-02-22 13:00,0.0,A1,R2,C1328
2,R2 starts A2 on C1328 at 2010-02-22 13:01,0.816667,A2,R2,C1328


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 70
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 30
    })
})

## Tokenization & How To Add New Tokens


In [139]:
# Start again from the stock tokenizer so the identifiers below are added to
# a clean vocabulary.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Every distinct activity / resource / case identifier in df_nlp is registered
# as its own token (the tokenization check further down shows e.g. 'R2', 'A3'
# and 'C1030' each coming back as a single token).
token_columns = ['activity_nlp','resource_nlp','case_nlp']

# dict.fromkeys drops duplicates across columns while keeping first-seen
# order, so tokens are registered in the same order as a per-value loop.
# Missing values are skipped because add_tokens only accepts strings.
new_tokens = list(dict.fromkeys(
    str(value)
    for column in token_columns
    for value in df_nlp[column].dropna().unique()
))

# One batched call instead of one add_tokens() call per identifier.
tokenizer.add_tokens(new_tokens)
        
def tokenize_function(examples):
    """Encode ``examples["text"]`` with the module-level ``tokenizer``.

    The call asks for truncation and ``padding="max_length"`` and hands back
    whatever the tokenizer returns.
    """
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **encoding_options)

# Tokenize the splits again, this time with the tokenizer that has the extra
# identifier tokens registered.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

#### Test Tokenization

In [140]:
# Sanity check on the first row: encode it, then decode every id on its own
# to see which piece of text each token id stands for.
text = df_nlp["text"][0]
tokens = tokenizer(text)['input_ids']
actual_tokens = [tokenizer.decode(i) for i in tokens]

# Show the raw text, the token ids and the per-id decoded strings together.
print(f'text  :{text} \ntokens:{tokens} \nactual token:{actual_tokens}')

text  :R2 starts A3 on C1030 at 2010-02-22 12:05 
tokens:[101, 30529, 4627, 30522, 2006, 30536, 2012, 2230, 1011, 6185, 1011, 2570, 2260, 1024, 5709, 102] 
actual token:['[CLS]', 'R2', 'starts', 'A3', 'on', 'C1030', 'at', '2010', '-', '02', '-', '22', '12', ':', '05', '[SEP]']


## Fine-Tuning The Model

In [141]:
from transformers import AutoModelForSequenceClassification

# num_labels=1 gives the sequence-classification head a single output, which
# is how this notebook sets the model up for regression.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
# The tokenizer had tokens added above, so the embedding table is resized to
# len(tokenizer) rows to cover the new token ids.
model.resize_token_embeddings(len(tokenizer))

Embedding(30581, 768)

### Metrics Function


In [142]:
from datasets import load_metric
from sklearn.metrics import mean_squared_error
import evaluate

def compute_metrics(eval_pred):
    """Compute the root-mean-squared error for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of array-likes holding the
            same number of values. Any extra singleton axes (for example
            predictions shaped ``(N, 1)`` against labels shaped ``(N,)``)
            are flattened away before comparing.

    Returns:
        ``{"rmse": value}`` where ``value`` is a Python float.

    Raises:
        ValueError: If the inputs are empty or hold different numbers of
            values.
    """
    predictions, labels = eval_pred

    # Flatten both sides to 1-D float64 so the subtraction below is
    # element-wise and can never broadcast (N, 1) against (N,) into (N, N).
    predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predictions.size == 0 or predictions.shape != labels.shape:
        raise ValueError(
            "predictions and labels must be non-empty and hold the same "
            f"number of values, got {predictions.size} and {labels.size}"
        )

    # Computed with NumPy rather than mean_squared_error(..., squared=False):
    # the `squared` argument is deprecated in recent scikit-learn releases.
    rmse = float(np.sqrt(np.mean((predictions - labels) ** 2)))
    return {"rmse": rmse}

def compute_metrics_mape(eval_pred):
    """Compute the mean absolute percentage error for an evaluation pass.

    The error is ``mean(|prediction - label| / max(|label|, eps))`` with
    ``eps`` the float64 machine epsilon, so labels equal to zero do not cause
    a division by zero. The value is a fraction, not multiplied by 100.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of array-likes holding the
            same number of values; extra singleton axes are flattened away.

    Returns:
        ``{"mape": value}`` where ``value`` is a Python float.

    Raises:
        ValueError: If the inputs are empty or hold different numbers of
            values.
    """
    predictions, labels = eval_pred

    predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predictions.size == 0 or predictions.shape != labels.shape:
        raise ValueError(
            "predictions and labels must be non-empty and hold the same "
            f"number of values, got {predictions.size} and {labels.size}"
        )

    # Computed directly with NumPy: no `mape_metric` object is defined in
    # this notebook, and a flat float is returned under the "mape" key
    # rather than a nested result dict.
    eps = np.finfo(np.float64).eps
    relative_error = np.abs(predictions - labels) / np.maximum(np.abs(labels), eps)
    mape = float(np.mean(relative_error))
    return {"mape": mape}

### Train The Model

In [143]:
from transformers import TrainingArguments, Trainer

# Three epochs with batches of 16; logging and evaluation both happen once
# per epoch, checkpoint saving is switched off and no external reporting
# integration is used.
# NOTE(review): save_total_limit presumably has no effect while
# save_strategy is 'no' - confirm and drop it if so.
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  save_total_limit = 2,
                                  save_strategy = 'no',
                                  load_best_model_at_end=False,
                                  report_to="none"
                                  )

# Train on the tokenized train split, evaluate on the tokenized test split,
# and report RMSE through compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics
)
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 70
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 15
  Number of trainable parameters = 66999553
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30
  Batch size = 16


{'loss': 0.2383, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'eval_loss': 0.2794472575187683, 'eval_rmse': 0.5286276936531067, 'eval_runtime': 3.4499, 'eval_samples_per_second': 8.696, 'eval_steps_per_second': 0.58, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30
  Batch size = 16


{'loss': 0.159, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'eval_loss': 0.17678657174110413, 'eval_rmse': 0.42045995593070984, 'eval_runtime': 3.4631, 'eval_samples_per_second': 8.663, 'eval_steps_per_second': 0.578, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30
  Batch size = 16


{'loss': 0.2052, 'learning_rate': 0.0, 'epoch': 3.0}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.1526743769645691, 'eval_rmse': 0.3907356560230255, 'eval_runtime': 3.4524, 'eval_samples_per_second': 8.69, 'eval_steps_per_second': 0.579, 'epoch': 3.0}
{'train_runtime': 108.1363, 'train_samples_per_second': 1.942, 'train_steps_per_second': 0.139, 'train_loss': 0.2008363167444865, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=0.2008363167444865, metrics={'train_runtime': 108.1363, 'train_samples_per_second': 1.942, 'train_steps_per_second': 0.139, 'train_loss': 0.2008363167444865, 'epoch': 3.0})

# Train and test chart and metrics

In [None]:
# Run the trained model over both splits; each result is read below through
# its `.predictions` and `.metrics` attributes.
predictions_test = trainer.predict(tokenized_datasets["test"])
predictions_train = trainer.predict(tokenized_datasets["train"])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 30
  Batch size = 16
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 70
  Batch size = 16


In [None]:
# Two side-by-side panels: model predictions on the train split (left) and
# on the test split (right).
data = [predictions_train.predictions,predictions_test.predictions]
fig, axs = plt.subplots(nrows=1,ncols=2,figsize=(10, 5))

axs[0].plot(data[0],label='train')
axs[0].title.set_text('train ')

# The series label matches the split being plotted (it used to read 'train'
# on this panel as well).
axs[1].plot(data[1],label='test')
axs[1].title.set_text('test ')

In [None]:
# Small text table comparing train and test metrics.
print(f'key\t\t train.metric \t test.metrics')

# Only the first two metric keys are shown; the same key is looked up in both
# results, each value formatted to three decimals.
for key in list(predictions_test.metrics.keys())[:2]:
    print(f'{key} \t\t{predictions_train.metrics[key]:0.3f} \t\t{predictions_test.metrics[key]:0.3f}')

## Save And Load The Fine-Tuned Model And Tokenizer


In [None]:
# Save the fine-tuned model and the extended tokenizer to local folders.

model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")

# Load the model/tokenizer back from those folders.
# The cell loads through AutoModelForSequenceClassification and AutoTokenizer,
# so both are imported here to keep the cell runnable on its own.
# AutoModelForTokenClassification is not used in this cell; it is retained
# only because the original import line brought it into scope.

from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

##  Use The Model

In [None]:
from transformers import Trainer
# Wrap the reloaded model in a Trainer created with nothing but the model;
# it is only used for predict() below.
trainer = Trainer(model=model)

def tokenize_function(examples):
    """Tokenize ``examples["text"]`` using the module-level ``tokenizer``.

    Truncation is on and ``padding="max_length"`` is requested; the result of
    the tokenizer call is returned as-is.
    """
    raw_text = examples["text"]
    return tokenizer(raw_text, truncation=True, padding="max_length")

def pipeline_prediction(text):
    """Return the model output for a single piece of text.

    The text is wrapped in a one-row DataFrame, converted to a ``Dataset``,
    tokenized with ``tokenize_function`` and passed to the module-level
    ``trainer``'s ``predict``.

    Args:
        text: The string to score.

    Returns:
        Element ``[0][0]`` of the first item returned by ``trainer.predict``.
    """
    frame = pd.DataFrame({'text': [text]})
    single_row = Dataset.from_pandas(frame, preserve_index=False)
    encoded = single_row.map(tokenize_function)

    # predict() is unpacked into exactly three items; only the first is used.
    raw_pred, _second, _third = trainer.predict(encoded)
    return raw_pred[0][0]

pipeline_prediction("🚨 Get 50% now!")