**Reference**:

https://towardsdatascience.com/how-to-fine-tune-an-nlp-regression-model-with-transformers-and-huggingface-94b2ed6f798f

https://predictivehacks.com/how-to-fine-tune-an-nlp-regression-model-with-transformers-and-huggingface/


# Don't normalize the data 

# Linear Regression 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from transformers.utils import logging
logging.set_verbosity_error()
pd.set_option('display.max_colwidth', None)

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer


In [2]:
! pwd

/Users/yousef/code/thesis-nlp/code-wip-dataset


In [3]:
FILE_PATH = '/Users/yousef/code/thesis-nlp/dataset/incident-ohlc-v06-scalar.csv'
COLUMNS =  ['w_open', 'w_high', 'w_low', 'w_close', 'new', 'started','done', 'dayofweek', 'dayofmonth', 'dayofyear', 'target', 'y_ystrdy']

# N_ROWS = 100
# df_ = pd.read_csv(FILE_PATH,parse_dates=['date'],index_col=['date'], nrows=N_ROWS)
df_ = pd.read_csv(FILE_PATH,parse_dates=['date'],index_col=['date'])
df_ = (df_.sort_index())[COLUMNS]
df_ = df_.iloc[1:-1]  # Remove NaN

max_number_in_data = int(df_.describe().transpose()["max"].max())
print(f'Max number in the data is {max_number_in_data}')
display(df_.head(2))

df = pd.DataFrame()
df['label']= df_['target']
df['text']= ''
for col in COLUMNS[:-2]:
    df_[col] = df_[col].apply(lambda x: f'{x:.0f}')
    df['text'] = df['text'] + f' {col} ' + df_[col].astype(str)

display(df.head(2))

Max number in the data is 366


Unnamed: 0_level_0,w_open,w_high,w_low,w_close,new,started,done,dayofweek,dayofmonth,dayofyear,target,y_ystrdy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-14,0.0,1.0,0.0,1.0,3.0,1.0,0.0,3,14,14,1.0,0.0
2010-01-15,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4,15,15,1.0,2.0


Unnamed: 0_level_0,label,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-14,1.0,w_open 0 w_high 1 w_low 0 w_close 1 new 3 started 1 done 0 dayofweek 3 dayofmonth 14 dayofyear 14
2010-01-15,1.0,w_open 1 w_high 1 w_low 1 w_close 1 new 1 started 0 done 0 dayofweek 4 dayofmonth 15 dayofyear 15


In [4]:
df.iloc[6]

label                                                                                                   6.0
text      w_open 3 w_high 4 w_low 3 w_close 4 new 0 started 1 done 0 dayofweek 2 dayofmonth 20 dayofyear 20
Name: 2010-01-20 00:00:00, dtype: object

In [5]:
# ','.join(df[['text']])
# df['text']
# ' '.join(df['text'][0:3].to_list())
# # df['label'][0]

## Create and customize AutoTokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
for x in COLUMNS:
    tokenizer.add_tokens(x)
    
for x in range(max_number_in_data):
    tokenizer.add_tokens(str(x))
    
        
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


## Pandas To Dataset

In [7]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from datasets import Dataset


df_lb = pd.DataFrame()
df_lb['label'] = ''
df_lb['text'] = ''

LOOKBACK_WINDOW =5
for i in range(LOOKBACK_WINDOW, len(df)):
    text = ' '.join(df['text'][i - LOOKBACK_WINDOW:i].to_list())
    label = df['label'][i]
    df_lb.loc[len(df_lb.index)] = [label,text] 

datasets = []
N_SPLIT = 10
tscv = TimeSeriesSplit(n_splits=N_SPLIT)
for train_index, test_index in tscv.split(df_lb):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    df_tr = df_lb.iloc[train_index]
    df_ts = df_lb.iloc[test_index]
    tr_ds = Dataset.from_pandas(df_tr, split="train",preserve_index=False)
    ts_ds = Dataset.from_pandas(df_ts, split="test",preserve_index=False)
    datasets.append({'train':tr_ds,'test':ts_ds})

print(f'length of datasets is {len(datasets)}')
print({datasets[0]['train']})
print({datasets[0]['test']})

TRAIN: 135 TEST: 131
TRAIN: 266 TEST: 131
TRAIN: 397 TEST: 131
TRAIN: 528 TEST: 131
TRAIN: 659 TEST: 131
TRAIN: 790 TEST: 131
TRAIN: 921 TEST: 131
TRAIN: 1052 TEST: 131
TRAIN: 1183 TEST: 131
TRAIN: 1314 TEST: 131
length of datasets is 10
{Dataset({
    features: ['label', 'text'],
    num_rows: 135
})}
{Dataset({
    features: ['label', 'text'],
    num_rows: 131
})}


## Fine-Tuning The Model

### Metrics Function


In [8]:
from datasets import load_metric
from sklearn.metrics import mean_squared_error
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # rmse = mean_squared_error(labels, predictions, squared=False)
    rmse = mean_squared_error(labels, predictions, squared=True)
    return {"rmse": rmse}

def compute_metrics_mape(eval_pred):
    predictions, labels = eval_pred
    mape = mape_metric.compute(predictions=predictions, references=labels)
    return {"mape": mape}

In [9]:
# num_labels =1 means regression
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  save_total_limit = 2,
                                  save_strategy = 'no',
                                  load_best_model_at_end=False,
                                  report_to="none",
                                  optim="adamw_torch",
                                  # optim="adagrad"
                                  

                                  )

### Train The Model

In [None]:
train_output = []
test_output = []

transformers.utils.logging.set_verbosity_error()

for idx, ds in enumerate(datasets):
    tokenized_train_ds = ds['train'].map(tokenize_function, batched=True)
    tokenized_test_ds = ds['test'].map(tokenize_function, batched=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_ds,
        eval_dataset=tokenized_test_ds,
        compute_metrics=compute_metrics
    )
    print(f'\n<= Training {idx} of {len(datasets)} .... =>')
    trainer.train()
    train_output.append(trainer.evaluate(tokenized_train_ds))
    test_output.append(trainer.evaluate())
print('\n<= Train is done =>')


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 135
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27
  Number of trainable parameters = 66969601



<= Training 0 of 10 .... =>


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 5714.3286, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'eval_loss': 9944.43359375, 'eval_rmse': 9944.43359375, 'eval_runtime': 16.7699, 'eval_samples_per_second': 7.812, 'eval_steps_per_second': 0.537, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 5340.3021, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'eval_loss': 9724.4609375, 'eval_rmse': 9724.4609375, 'eval_runtime': 17.3918, 'eval_samples_per_second': 7.532, 'eval_steps_per_second': 0.517, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 5325.852, 'learning_rate': 0.0, 'epoch': 3.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 135
  Batch size = 16


{'eval_loss': 9656.8359375, 'eval_rmse': 9656.8349609375, 'eval_runtime': 16.2174, 'eval_samples_per_second': 8.078, 'eval_steps_per_second': 0.555, 'epoch': 3.0}
{'train_runtime': 238.984, 'train_samples_per_second': 1.695, 'train_steps_per_second': 0.113, 'train_loss': 5460.16087962963, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'eval_loss': 5312.42333984375, 'eval_rmse': 5312.421875, 'eval_runtime': 16.7429, 'eval_samples_per_second': 8.063, 'eval_steps_per_second': 0.538, 'epoch': 3.0}
{'eval_loss': 9656.8359375, 'eval_rmse': 9656.8349609375, 'eval_runtime': 16.3758, 'eval_samples_per_second': 8.0, 'eval_steps_per_second': 0.55, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 266
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 51
  Number of trainable parameters = 66969601



<= Training 1 of 10 .... =>


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 7297.0528, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'eval_loss': 19515.29296875, 'eval_rmse': 19515.294921875, 'eval_runtime': 16.0087, 'eval_samples_per_second': 8.183, 'eval_steps_per_second': 0.562, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 7016.4426, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'eval_loss': 19191.33984375, 'eval_rmse': 19191.33984375, 'eval_runtime': 16.7533, 'eval_samples_per_second': 7.819, 'eval_steps_per_second': 0.537, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 6929.0028, 'learning_rate': 0.0, 'epoch': 3.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 266
  Batch size = 16


{'eval_loss': 19076.654296875, 'eval_rmse': 19076.65625, 'eval_runtime': 16.5329, 'eval_samples_per_second': 7.924, 'eval_steps_per_second': 0.544, 'epoch': 3.0}
{'train_runtime': 414.4615, 'train_samples_per_second': 1.925, 'train_steps_per_second': 0.123, 'train_loss': 7080.832720588235, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'eval_loss': 6874.45458984375, 'eval_rmse': 6874.4541015625, 'eval_runtime': 33.9205, 'eval_samples_per_second': 7.842, 'eval_steps_per_second': 0.501, 'epoch': 3.0}
{'eval_loss': 19076.654296875, 'eval_rmse': 19076.65625, 'eval_runtime': 16.8757, 'eval_samples_per_second': 7.763, 'eval_steps_per_second': 0.533, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 397
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 75
  Number of trainable parameters = 66969601



<= Training 2 of 10 .... =>


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 10643.0, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'eval_loss': 28529.65234375, 'eval_rmse': 28529.65234375, 'eval_runtime': 16.5806, 'eval_samples_per_second': 7.901, 'eval_steps_per_second': 0.543, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 10118.6756, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'eval_loss': 27841.40234375, 'eval_rmse': 27841.400390625, 'eval_runtime': 16.4756, 'eval_samples_per_second': 7.951, 'eval_steps_per_second': 0.546, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 9850.415, 'learning_rate': 0.0, 'epoch': 3.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 397
  Batch size = 16


{'eval_loss': 27601.236328125, 'eval_rmse': 27601.236328125, 'eval_runtime': 16.5137, 'eval_samples_per_second': 7.933, 'eval_steps_per_second': 0.545, 'epoch': 3.0}
{'train_runtime': 600.7858, 'train_samples_per_second': 1.982, 'train_steps_per_second': 0.125, 'train_loss': 10204.030208333334, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'eval_loss': 9790.591796875, 'eval_rmse': 9790.591796875, 'eval_runtime': 51.3265, 'eval_samples_per_second': 7.735, 'eval_steps_per_second': 0.487, 'epoch': 3.0}
{'eval_loss': 27601.236328125, 'eval_rmse': 27601.236328125, 'eval_runtime': 17.0721, 'eval_samples_per_second': 7.673, 'eval_steps_per_second': 0.527, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 528
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 99
  Number of trainable parameters = 66969601



<= Training 3 of 10 .... =>


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 13670.8883, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'eval_loss': 18833.166015625, 'eval_rmse': 18833.16796875, 'eval_runtime': 16.9673, 'eval_samples_per_second': 7.721, 'eval_steps_per_second': 0.53, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 12744.1136, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'eval_loss': 17958.390625, 'eval_rmse': 17958.392578125, 'eval_runtime': 17.1228, 'eval_samples_per_second': 7.651, 'eval_steps_per_second': 0.526, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'loss': 12271.8892, 'learning_rate': 0.0, 'epoch': 3.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 528
  Batch size = 16


{'eval_loss': 17657.205078125, 'eval_rmse': 17657.203125, 'eval_runtime': 16.5998, 'eval_samples_per_second': 7.892, 'eval_steps_per_second': 0.542, 'epoch': 3.0}
{'train_runtime': 776.1483, 'train_samples_per_second': 2.041, 'train_steps_per_second': 0.128, 'train_loss': 12895.630366161617, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 131
  Batch size = 16


{'eval_loss': 12176.06640625, 'eval_rmse': 12176.06640625, 'eval_runtime': 68.0673, 'eval_samples_per_second': 7.757, 'eval_steps_per_second': 0.485, 'epoch': 3.0}
{'eval_loss': 17657.205078125, 'eval_rmse': 17657.203125, 'eval_runtime': 16.8956, 'eval_samples_per_second': 7.754, 'eval_steps_per_second': 0.533, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 659
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 126
  Number of trainable parameters = 66969601



<= Training 4 of 10 .... =>


In [None]:
# tr_loss = [x.training_loss for x in train_output]
tr_loss = [x['eval_loss'] for x in train_output]
ts_loss = [x['eval_loss'] for x in test_output]
plt.plot(tr_loss,label='tr_loss')
plt.plot(ts_loss,label='ts_loss')
plt.legend()

In [None]:
# tr_loss = [x.metrics['train_runtime'] for x in train_output]
ts_loss = [x['eval_rmse'] for x in train_output]
ts_loss = [x['eval_rmse'] for x in test_output]
plt.plot(tr_loss,label='tr_rmse')
plt.plot(ts_loss,label='ts_rmse')
plt.legend()

In [None]:
train_output[0]

In [None]:
# tokenized_test_ds['text']
# tokenized_test_ds['text']


# Train and test chart and metrics

In [None]:
print('\n<= Predicting =>')
predictions_test = trainer.predict(tokenized_datasets["test"])
predictions_train = trainer.predict(tokenized_datasets["train"])
print('\n<= Predictions are done =>')

In [None]:
data = [predictions_train.predictions,predictions_test.predictions]
fig, axs = plt.subplots(nrows=1,ncols=2,figsize=(10, 5))

axs[0].plot(data[0],label='train')
axs[0].plot(dataset["train"]['label'],label='y')
axs[0].title.set_text('train ')
axs[0].legend(loc='upper right')


axs[1].plot(data[1],label='test')
axs[1].plot(dataset["test"]['label'],label='y')
axs[1].title.set_text('test ')
axs[1].legend(loc='upper right')


In [None]:
plt.plot(predictions_test.predictions)

In [None]:
print(f'key\t\t train.metric \t test.metrics')

for key in list(predictions_test.metrics.keys())[:2]:
    print(f'{key} \t\t{predictions_train.metrics[key]:0.3f} \t\t{predictions_test.metrics[key]:0.3f}')

## Save And Load The Pre-Trained Model And Tokenizer


In [None]:
# save the model/tokenizer

model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")

# load the model/tokenizer

from transformers import AutoModelForTokenClassification
model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

##  Use The Model

In [None]:
from transformers import Trainer
trainer = Trainer(model=model)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True) 

def pipeline_prediction(text):
    df=pd.DataFrame({'text':[text]})
    dataset = Dataset.from_pandas(df,preserve_index=False) 
    tokenized_datasets = dataset.map(tokenize_function)
    raw_pred, _, _ = trainer.predict(tokenized_datasets) 
    return(raw_pred[0][0])

pipeline_prediction("🚨 Get 50% now!")