In [46]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


The `transformers` and `accelerate` packages must be installed or upgraded first; otherwise the Hugging Face `Trainer` raises an error:

In [47]:
!pip install --upgrade transformers accelerate

[0m

In [48]:
# Load the labelled competition training data into a DataFrame.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

In [49]:
from datasets import Dataset,DatasetDict

# Wrap the DataFrame in a Hugging Face `Dataset` so it can be tokenized with `.map`.
ds = Dataset.from_pandas(train_df)

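I decided to use the DistilBERT model fine-tuned on SST-2: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english

This model classifies text, giving the probabilities for negative and positive sentiment.

Note: the code below actually loads the base `distilbert-base-uncased` checkpoint (not the SST-2 fine-tuned one) and fine-tunes it on the competition data.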

In [50]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [51]:
# Load the pretrained tokenizer for the `distilbert-base-uncased` checkpoint
# (the same checkpoint name the model is later loaded from).
tokz = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [52]:
def tokenize(input):
    """Tokenize the ``"text"`` entry of a dataset example or batch.

    Args:
        input: Mapping with a ``"text"`` entry (a list of strings when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level tokenizer ``tokz`` returns for those texts.
    """
    texts = input["text"]
    encoded = tokz(texts)
    return encoded

In [53]:
# Tokenize every row; batched=True hands `tokenize` batches of rows instead of single rows.
tokenized_ds = ds.map(tokenize, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

Rename the `target` column to `labels`, because the transformers library expects the label column to be called `labels`:

In [54]:
# Rename the label column from 'target' to 'labels'.
# NOTE(review): this rebinds `tokenized_ds` to its own renamed copy, so re-running only this
# cell presumably fails because 'target' no longer exists — re-run the tokenization cell first.
tokenized_ds = tokenized_ds.rename_columns({'target': 'labels'})

In [55]:
# Hold out 25% of the rows (the 'test' split) for validation; the fixed seed keeps the split reproducible.
dds = tokenized_ds.train_test_split(0.25, seed=42)
# Display the resulting DatasetDict (split names, features, row counts).
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5709
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1904
    })
})

In [56]:
from transformers import TrainingArguments,Trainer

The metric F1 (provided by scikit-learn) is used to evaluate the competition submissions, hence we need to create a function to process it:

In [57]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the weighted-average F1 score for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the true class ids).

    Returns:
        dict: ``{'f1': <weighted F1>}``.

    NOTE(review): ``average='weighted'`` averages the per-class F1 by class
    support; confirm this matches the F1 definition used by the competition.
    """
    # The predicted class is the index of the highest score along the last axis.
    predicted_classes = eval_pred.predictions.argmax(-1)
    score = f1_score(eval_pred.label_ids, predicted_classes, average='weighted')
    return {'f1': score}

In [58]:
# Training hyperparameters consumed by TrainingArguments.
bs = 128  # per-device training batch size (evaluation uses bs*2)
epochs = 4  # number of passes over the training split

In [59]:
# Learning rate passed to TrainingArguments (used there with warmup and a cosine schedule).
lr = 8e-5

In [60]:
# Training configuration for the Hugging Face Trainer.
args = TrainingArguments(
    'outputs',                            # directory for checkpoints and other outputs
    learning_rate=lr,
    warmup_ratio=0.1,                     # warm up over the first 10% of training
    lr_scheduler_type='cosine',           # cosine decay after warmup
    fp16=True,                            # mixed-precision training
    evaluation_strategy="epoch",          # evaluate at the end of every epoch
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,      # no gradients at eval time, so a larger batch fits
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',                     # disable external experiment-tracking integrations
)

In [61]:
# Load the pretrained base checkpoint with a sequence-classification head on top
# (the head weights are newly initialized and learned during fine-tuning).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Wire model, training configuration, data splits, tokenizer and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.

In [62]:
# Fine-tune the model; validation loss and F1 are reported at the end of every epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,No log,0.435363,0.826791
2,No log,0.404803,0.829823
3,No log,0.421728,0.832169
4,No log,0.435191,0.824234




TrainOutput(global_step=92, training_loss=0.3504586012467094, metrics={'train_runtime': 33.934, 'train_samples_per_second': 672.954, 'train_steps_per_second': 2.711, 'total_flos': 402753671893392.0, 'train_loss': 0.3504586012467094, 'epoch': 4.0})

In [63]:
# Load the competition test set, for which predictions will be submitted.
eval_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [64]:
# Convert the test DataFrame to a Dataset and tokenize it with the same `tokenize` function used for training.
eval_ds = Dataset.from_pandas(eval_df).map(tokenize, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [65]:
# Run inference on the test set; `.predictions` holds the raw per-class scores, cast here to float.
preds = trainer.predict(eval_ds).predictions.astype(float)
# Display the raw scores (one row per test example, one column per class).
preds

array([[-1.77832031,  1.32519531],
       [-2.24609375,  1.83007812],
       [-2.12890625,  1.66894531],
       ...,
       [-2.515625  ,  2.1171875 ],
       [-1.77832031,  1.51660156],
       [-2.06054688,  1.73339844]])

The predictions are a 2D array in which each row holds two numbers, the so-called "raw scores" (logits), one per class.

We need to transform them into probabilities, which can be done with the softmax function:
$$\mathrm{softmax}(s)_i = \frac{\exp(s_i)}{\sum_j \exp(s_j)}$$

This function is applied to each pair of raw scores. As a result, we get two probabilities that sum to 1: one for the negative class (no disaster) and one for the positive class (disaster). The higher a raw score, the higher the corresponding probability.

In the cell below, `probabilities` is a 2D array containing these probabilities. We then apply `argmax()`, which returns the index of the larger value in each row, so we get 0 for negative (no disaster) and 1 for positive (disaster).

In [66]:
# Convert raw scores to class probabilities with a numerically stable softmax:
# subtracting each row's maximum before exponentiating leaves the result
# mathematically unchanged but prevents np.exp from overflowing on large scores.
shifted_scores = preds - np.max(preds, axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)
probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Index of the larger probability in each row: 0 = no disaster, 1 = disaster.
class_predictions = np.argmax(probabilities, axis=1)
probabilities

array([[0.04296247, 0.95703753],
       [0.01668906, 0.98331094],
       [0.0219273 , 0.9780727 ],
       ...,
       [0.00963365, 0.99036635],
       [0.03574581, 0.96425419],
       [0.02201123, 0.97798877]])

In [67]:
# Attach the predicted class index (0 or 1) to each test row.
eval_df['preds'] = class_predictions

In [68]:
# Remove the feature columns that are not wanted in the submission file.
# Rebinding instead of inplace=True, together with errors="ignore", keeps this cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
eval_df = eval_df.drop(columns=["keyword", "location", "text"], errors="ignore")

In [69]:
# Rename the prediction column to 'target' — presumably the column name expected in the
# submission file (cf. sample_submission.csv); verify against the competition format.
submissions_df = eval_df.rename(columns={'preds': 'target'})

In [71]:
# Write the submission file to the working directory, omitting the DataFrame index.
submissions_df.to_csv('submissions.csv', index=False)