This will use Hugging Face transformers on the Disaster Tweets dataset. The basis for this work is from the fast.ai course on NLP at: https://course.fast.ai/Lessons/lesson4.html 

In [1]:
# To install the Hugging Face datasets
! pip install -q datasets
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [2]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression

import datasets
from datasets import Dataset,DatasetDict
from datasets import load_metric
import evaluate

import re
import emoji
import nltk

from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments,Trainer
from transformers import set_seed

import warnings, logging
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

print('Training Set Shape = {}'.format(train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test.memory_usage().sum() / 1024**2))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


In [4]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In researching a way to clean up the text in these tweets, the following Stack Overflow post was extremely helpful: https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

In [6]:
train_clean_tweets = []
for tweet in train['text']:
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    emojis = emoji.distinct_emoji_list(tweet)
    tweet = ''.join(c for c in tweet if c not in emojis) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    #tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         #if w.lower() in tweet or not w.isalpha())
    train_clean_tweets.append(tweet)
    
train['clean_text'] = train_clean_tweets

In [7]:
train.head(25)

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,RockyFire Update => California Hwy. 20 closed ...
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster Heavy rain causes flash floodin...
7,13,,,I'm on top of the hill and I can see a fire in...,1,I'm on top of the hill and I can see a fire in...
8,14,,,There's an emergency evacuation happening now ...,1,There's an emergency evacuation happening now ...
9,15,,,I'm afraid that the tornado is coming to our a...,1,I'm afraid that the tornado is coming to our a...


In [8]:
test_clean_tweets = []
for tweet in test['text']:
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    emojis = emoji.distinct_emoji_list(tweet)
    tweet = ''.join(c for c in tweet if c not in emojis) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    #tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         #if w.lower() in tweet or not w.isalpha())
    test_clean_tweets.append(tweet)
    
test['clean_text'] = test_clean_tweets

In [9]:
test

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,Green Line derailment in Chicago
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,MEG issues Hazardous Weather Outlook (HWO)


The following error was received when starting the trainer: "RuntimeError: "mse_cuda" not implemented for 'Long'". From a Google search it appears the target data needs to be a float datatype. 

In [10]:
train['target'] = train['target'].astype(float)

In [11]:
train.dtypes

id              int64
keyword        object
location       object
text           object
target        float64
clean_text     object
dtype: object

There are no missing values for the text and target in the train dataset and the text in the test dataset. There are a large amount of missing values in location while there are a small amount for keyword.

In [12]:
train.isna().sum()

id               0
keyword         61
location      2533
text             0
target           0
clean_text       0
dtype: int64

In [13]:
test.isna().sum()

id               0
keyword         26
location      1105
text             0
clean_text       0
dtype: int64

Hugging Faces expects the input data to actually be titled 'input', so the text column headings are changed. 

In [14]:
train.rename(columns={'clean_text':'input'}, inplace=True)
test.rename(columns={'clean_text':'input'}, inplace=True)

In [15]:
train

Unnamed: 0,id,keyword,location,text,target,input
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1.0,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,Just got sent this photo from Ruby Alaska as s...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1.0,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1.0,ahrary The out of control wild fires in Calif...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1.0,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.
7611,10872,,,Police investigating after an e-bike collided ...,1.0,Police investigating after an e-bike collided ...


Looking at the number of unique values in each column of the train dataset it shows that there are 7613 total columns, but only 6922 of the input columns are unique, which is a total of 791 rows. That is a lot. The question whether a unique input value with many occurances are all labeled with the same target values.   

In [16]:
train.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
input       6922
dtype: int64

To explore this a new column called 'unique_input' is created to be able to explore some of the larger occurances and then possible be used as a basis to correct any issues. 

In [17]:
train['unique_input'] = pd.factorize(train['input'])[0] + 1

In [18]:
train

Unnamed: 0,id,keyword,location,text,target,input,unique_input
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near La Ronge Sask. Canada,2
2,5,,,All residents asked to 'shelter in place' are ...,1.0,All residents asked to 'shelter in place' are ...,3
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,"13,000 people receive wildfires evacuation ord...",4
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,Just got sent this photo from Ruby Alaska as s...,5
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1.0,Two giant cranes holding a bridge collapse int...,1080
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1.0,ahrary The out of control wild fires in Calif...,6667
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1.0,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,6497
7611,10872,,,Police investigating after an e-bike collided ...,1.0,Police investigating after an e-bike collided ...,1573


Looking at the top five unique occurances, only the 4th one, 4061, had variations in the target values. It doesn't appear to be a disaster, but 5 out of 17 occurances were coded as a disaster. 

In [19]:
train['unique_input'].value_counts().nlargest(5)

5351    24
6848    20
4862    19
4061    17
447     15
Name: unique_input, dtype: int64

In [20]:
print(train.loc[train['unique_input'] == 4061])

        id    keyword             location  \
4391  6243  hijacking    perth, australia    
4392  6244  hijacking             Mongolia   
4393  6245  hijacking  brisbane, australia   
4394  6246  hijacking                China   
4396  6248  hijacking  Chiyoda Ward, Tokyo   
4397  6253  hijacking                 rome   
4399  6255  hijacking         EastCarolina   
4400  6256  hijacking               Brazil   
4403  6259  hijacking                  NaN   
4404  6261  hijacking               France   
4405  6262  hijacking                  NaN   
4407  6265  hijacking                tokyo   
4408  6267  hijacking                china   
4412  6272  hijacking               Brazil   
4414  6274  hijacking                  NaN   
4415  6276  hijacking                Japan   
4420  6283  hijacking                  NaN   

                                                   text  target  \
4391  #hot  Funtenna: hijacking computers to send da...     0.0   
4392  #hot  Funtenna: hijacking compu

In [21]:
train['unique_input'].value_counts().ne(1).sum()

314

In [22]:
train_unique_mode = train.groupby('unique_input').agg({'target': lambda x: x.value_counts().index[0]}).reset_index()

In [23]:
train

Unnamed: 0,id,keyword,location,text,target,input,unique_input
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near La Ronge Sask. Canada,2
2,5,,,All residents asked to 'shelter in place' are ...,1.0,All residents asked to 'shelter in place' are ...,3
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,"13,000 people receive wildfires evacuation ord...",4
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,Just got sent this photo from Ruby Alaska as s...,5
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1.0,Two giant cranes holding a bridge collapse int...,1080
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1.0,ahrary The out of control wild fires in Calif...,6667
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1.0,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,6497
7611,10872,,,Police investigating after an e-bike collided ...,1.0,Police investigating after an e-bike collided ...,1573


In [24]:
train_unique_mode

Unnamed: 0,unique_input,target
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0
...,...,...
6917,6918,1.0
6918,6919,1.0
6919,6920,1.0
6920,6921,1.0


In [25]:
train['new_target'] = train['unique_input'].map(train_unique_mode.set_index('unique_input')['target'])

In [26]:
train.head(25)

Unnamed: 0,id,keyword,location,text,target,input,unique_input,new_target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,Our Deeds are the Reason of this earthquake Ma...,1,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,Forest fire near La Ronge Sask. Canada,2,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0,All residents asked to 'shelter in place' are ...,3,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,"13,000 people receive wildfires evacuation ord...",4,1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,Just got sent this photo from Ruby Alaska as s...,5,1.0
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1.0,RockyFire Update => California Hwy. 20 closed ...,6,1.0
6,10,,,#flood #disaster Heavy rain causes flash flood...,1.0,flood disaster Heavy rain causes flash floodin...,7,1.0
7,13,,,I'm on top of the hill and I can see a fire in...,1.0,I'm on top of the hill and I can see a fire in...,8,1.0
8,14,,,There's an emergency evacuation happening now ...,1.0,There's an emergency evacuation happening now ...,9,1.0
9,15,,,I'm afraid that the tornado is coming to our a...,1.0,I'm afraid that the tornado is coming to our a...,10,1.0


It looks like there are 89 rows where the new target is not equal to the original target, which means 89 rows were changed based on looking at the mode of unique tweets with more than one occurance. 

In [27]:
len(train.query('new_target != target'))

89

Need to convert the new target values to floating values. 

In [28]:
train['new_target'] = train['new_target'].astype(float)

To turn the train and test dataframes into Hugging Face datasets.

In [29]:
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [30]:
train_dataset

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input', 'unique_input', 'new_target'],
    num_rows: 7613
})

In [31]:
test_dataset

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'input'],
    num_rows: 3263
})

Neural networks work with numbers. So the tokenization and numericilization do that. First tokenization splits the text strings into separate words. Each unique word then gets a number; which is numerilization.

The challenge is that the larger the vocabulary, the more memory it will use and the slower the ML process could be. So now these words are broken into subwords so that there is less overall vocabulary used. Overall we call these distinct vocabulary pieces 'tokens'.

The details on how to tokenize and numericilize depends on what model is used. Hugging Face has a libary of hundreds of pretrained models. It actually has over 128k models at: https://huggingface.co/models

These models vary in a number of ways. There's a variety of different architectures and each of these architectures could be trained on different corpuses to solve different problems. So you can go to this Hugging Face site and 
type in 'disasters' and find that 2 different models come up. More can come up on other subjects. For example, if 'law' is typed in, 224 models are retrieved. 

However there are just a lot of good models that work for a lot of things a lot of the time. Deberta V3 is one of them. So a small version is used below as a starting point (so it will be faster to try things with).

In [32]:
model_nm = 'microsoft/deberta-v3-small'

To tell Hugging Face to tokenize something the same way as it was originally done in the model, we use AutoTokenizer. That way what is done here will match what was originally done on the pretrained model. 

In [33]:
tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

To see how it works, you can pass a string in and see how it tokenizes the string. It puts some things into words and many things into subsets of those words as it categorizes parts of the text as tokens. The underscores below represent the start of a word. For the second sentence below we can see how it breaks down (or doesn't break down) more complicated words. These are all part of the unique tokens that were created when this model was first trained. And each of these tokens have their own number.

In [34]:
tokz.tokenize("Welcome to one of our 'Getting Started' competitions!")

['▁Welcome',
 '▁to',
 '▁one',
 '▁of',
 '▁our',
 "▁'",
 'Getting',
 '▁Started',
 "'",
 '▁competitions',
 '!']

Below is a simple function that takes a document x and tokenizes its 'input' column if it has one. 

In [35]:
def tok_func(x): return tokz(x['input'])

The .map will parallize this function since tokenization can take a long time. Batched=True sets up the batching process so it can do a bunch at a time. This will be done for both the train and test Hugging Face datasets. 

In [36]:
tok_ds_train = train_dataset.map(tok_func, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [37]:
tok_ds_test = test_dataset.map(tok_func, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [38]:
tok_ds_train

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input', 'unique_input', 'new_target', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7613
})

This process creates a new row with its IDs. Here it's called 'input_ids'. Creating the variable below allows us to see these two columns for the first row. These numbers are the position in the vocabulary of the model for each of the tokens. So here we've turned a string into a list of numbers.

In [39]:
row = tok_ds_train[0]
row['input'], row['input_ids']

('Our Deeds are the Reason of this earthquake May ALLAH Forgive us all',
 [1,
  581,
  65453,
  281,
  262,
  18037,
  265,
  291,
  10612,
  903,
  4924,
  17018,
  43632,
  381,
  305,
  2])

Hugging Face expects that your target is a column called 'labels'. So we have to rename our 'new_target' column as 'labels' as done below.

In [40]:
tok_ds_train = tok_ds_train.rename_columns({'new_target':'labels'})

To prevent against overfitting we remove a certain percent of our datapoints from the testing dataset and call that a validation dataset. We then fit our model against only the testing datapoints; those datapoints we haven't removed.

Fastai always makes sure you have a validation dataset. Below we use Hugging Faces function to do this.

We will hold out 25% of the data as a validation set. Here we are using a random split just to keep things simple. We'll use what's labeled below as 'train' to actually train with. And then it will measure the results on the 'test' dataset which is composed of the seperated out 25% of the original data. It's really a validation set, but Hugging Faces calls it 'test'.

In [41]:
train_dds = tok_ds_train.train_test_split(0.25, seed=42)
train_dds

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target', 'input', 'unique_input', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5709
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target', 'input', 'unique_input', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1904
    })
})

For the patent dataset we want to report the correlation after each epoch because we want to know how our training is going. Also, Transformers expects metrics to be returned as a dict, since that way the trainer knows what data to use. It will use the keys of the dictionary to label each metric.

So he created this function to do that. It takes the correlation and ties it to the label 'pearson'.

In [42]:
metric = load_metric("f1")

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [43]:
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    return metric.compute(predictions=preds, references=labels)

We want to pick a batch size that fits our GPU and a small number of epochs so we can run experiments quickly. So here we will pass on 128 rows at a time to our GPU as it processes the dataset in parallel. The larger your batch size the more it can process in parallel and will be faster. But if you make it too big, you'll get an 'out of memory' error on you GPU.

In [44]:
bs = 128
epochs = 4

Learning rate is the most important hyperparameter. Fastai can figure out the best learning rate for you. Hugging Face doesn't do this, so you'll have to use trial and error. Below he uses one he found that works best.

What he does is use a really low training rate and then double it and keep doubling it until it falls apart. Then use that last highest value that worked.

In [45]:
lr = 8e-5

In [46]:
set_seed(42)

Transformers uses the TrainingArguments class to set up arguments. He says that most the values below can be used for most situations. The only ones we really need to worry about are what we set up above: batches, epochs and learning rate.

In [47]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, 
                         lr_scheduler_type='cosine', fp16=True,
                         evaluation_strategy="epoch", 
                         per_device_train_batch_size=bs, 
                         per_device_eval_batch_size=bs*2,
                         num_train_epochs=epochs, weight_decay=0.01, 
                         report_to='none')

We can now create our model using the Deberta V3 pretrained model we found earlier. Since we are trying to do classification of sequences, we use the appropriate process from Hugging Face. Here it will create a classifcation of sequences from a pretrained model. We have one label, which is the score, so num_labels=1.

Then we create the trainer using the training dataset we labeled 'dds' that has been split between the testing data called 'train' and the validation data called 'test'. For compute_metrics we are using the corr_d function that we created earlier.

In [48]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=train_dds['train'], 
                  eval_dataset=train_dds['test'],
                  tokenizer=tokz, compute_metrics=compute_metrics)
                  #compute_metrics=corr_d)

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

And we can train the data.

In [49]:
trainer.train();

Epoch,Training Loss,Validation Loss,F1
1,No log,0.145391,0.516712
2,No log,0.124519,0.36747
3,No log,0.117715,0.087059
4,No log,0.126504,0.715724


Now that we have a trained model, now we want to submit our predictions to Kaggle. So we want to pass in our predictions from the Kaggle test file that we've trained as a floating number.

In [50]:
preds = trainer.predict(tok_ds_test).predictions.astype(float)
preds

array([[1.11184776],
       [1.01712143],
       [1.04776537],
       ...,
       [1.13682044],
       [1.06740928],
       [0.95314896]])

The first submission failed, possibly because the results were not integers. 

In [51]:
preds = preds.round().astype(int).squeeze()
preds

array([1, 1, 1, ..., 1, 1, 1])

Now we need to convert the Hugging Face dataset back to a csv file, add in the IDs from the testing data IDs, and add in the predictions as a 'score' column.

In [52]:
submission = datasets.Dataset.from_dict({
    'id': tok_ds_test['id'],
    'target': preds
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

22746