**Reference**:

https://towardsdatascience.com/how-to-fine-tune-an-nlp-regression-model-with-transformers-and-huggingface-94b2ed6f798f

In [1]:
from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [2]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [3]:
tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
)

{'input_ids': [[101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], [101, 11312, 18763, 10855, 11530, 112, 162, 39487, 10197, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [4]:
from transformers import pipeline

# Allocate a pipeline for question-answering
question_answerer = pipeline('question-answering')

# Ask a question
answer = question_answerer({
	'question': 'Where is KDnuggets headquartered?',
	'context': 'KDnuggets was founded in February of 1997 by Gregory Piatetsky in Brookline, Massachusetts.'
})

# Print the answer
print(answer)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Metal device set to: Apple M1


2022-12-07 18:31:38.618923: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-07 18:31:38.619273: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFDistilBertForQuestionAnswering.

All the layers of TFDistilBertForQuestionAnswering were initialized from the model checkpoint at distilbert-base-cased-distilled-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


{'score': 0.9153627157211304, 'start': 66, 'end': 90, 'answer': 'Brookline, Massachusetts'}


In [5]:
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokens = tokenizer('🚨 JUNE DROP LIVE 🚨')['input_ids']
actual_tokens = [tokenizer.decode(i) for i in tokenizer('🚨 JUNE DROP LIVE 🚨')['input_ids']]

print(f'tokens      :{tokens}')
print(f'actual token:{actual_tokens}')

for i in ['🚨', '🙂', '😍', '✌️' , '🤩 ']:
    tokenizer.add_tokens(i)
revised_actual_tokens = [tokenizer.decode(i) for i in tokenizer('🚨 JUNE DROP LIVE 🚨')['input_ids']]

# Now, if you tokenize the sentence you will see that the emoji remains as emoji and not the [UNK] token.
print(f'revised actual tokens:{revised_actual_tokens}')


tokens      :[101, 100, 2238, 4530, 2444, 100, 102]
actual token:['[CLS]', '[UNK]', 'june', 'drop', 'live', '[UNK]', '[SEP]']
revised actual tokens:['[CLS]', '🚨', 'june', 'drop', 'live', '🚨', '[SEP]']


In [7]:
# df_ = pd.read_csv('df_nlp_real.csv')
# df_.head(2)

In [8]:
import pandas as pd 

columns = ['text','wip','activity_nlp','resource_nlp','case_nlp']
def make_dataset(dataset, iloc_from, iloc_to):
    df_ = dataset[columns].iloc[iloc_from:iloc_to]  
    df_.rename(columns={"wip": "labels"})
    df_.reset_index(inplace=True,drop=True)
    return df_

df = pd.read_csv('df_nlp_real.csv',usecols=columns,dtype={'wip':'float'})
df_nlp = make_dataset(df, 200,300)
display(df_nlp.head())

Unnamed: 0,text,wip,activity_nlp,resource_nlp,case_nlp
0,R2 starts A3 on C1030 at 2010-02-22 12:05,48.0,A3,R2,C1030
1,R2 starts A1 on C1328 at 2010-02-22 13:00,0.0,A1,R2,C1328
2,R2 starts A2 on C1328 at 2010-02-22 13:01,49.0,A2,R2,C1328
3,R2 starts A1 on C3185 at 2010-02-23 08:23,0.0,A1,R2,C3185
4,R2 starts A2 on C3185 at 2010-02-23 08:24,50.0,A2,R2,C3185


In [10]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
token_columns = ['activity_nlp','resource_nlp','case_nlp']
for x in token_columns:
    for y in df_nlp[x].unique():
        tokenizer.add_tokens(y)
        
# df_nlp["token"] = df_nlp["text"].apply(lambda x: tokenizer(x , padding="max_length", truncation=True))
df_nlp["token"] = df_nlp["text"].apply(lambda x: tokenizer(x , padding=True, truncation=True))

ls = [df_nlp["token"][x]['input_ids'] for x in range(len(df_nlp["token"]))]
print(f'max len of tokens:{max([len(x) for x in ls])}')

text = df_nlp["text"][0]
tokens = tokenizer(text)['input_ids']
actual_tokens = [tokenizer.decode(i) for i in tokens]

print(f'text  :{text} \ntokens:{tokens} \nactual token:{actual_tokens}')

max len of tokens:16
text  :R2 starts A3 on C1030 at 2010-02-22 12:05 
tokens:[101, 30529, 4627, 30522, 2006, 30536, 2012, 2230, 1011, 6185, 1011, 2570, 2260, 1024, 5709, 102] 
actual token:['[CLS]', 'R2', 'starts', 'A3', 'on', 'C1030', 'at', '2010', '-', '02', '-', '22', '12', ':', '05', '[SEP]']


In [None]:

for x,row in df_nlp.iterrows():
    print(len(row['token']['input_ids'])