In [16]:
import pandas as pd

# Read the train and test CSVs and drop the 'id', 'keyword' and 'location'
# columns in the same expression, so no frame is mutated after creation.
df_train = pd.read_csv('data/train.csv').drop(columns=['id', 'keyword', 'location'])
df_test = pd.read_csv('data/test.csv').drop(columns=['id', 'keyword', 'location'])

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical across runs.
train, val = train_test_split(df_train, test_size=0.2, random_state=42)

# reset_index() (not inplace) returns new frames whose previous row labels are
# kept in an 'index' column, exactly as before. Because nothing is mutated in
# place, df_test is left untouched and re-running this cell no longer stacks
# extra 'index'/'level_0' columns onto it.
train = train.reset_index()
val = val.reset_index()
test = df_test.reset_index()

In [18]:
from datasets import Dataset, DatasetDict

# Convert each pandas split to a Dataset and collect them keyed by split name.
ds = DatasetDict({
    'train': Dataset.from_pandas(train),
    'validation': Dataset.from_pandas(val),
    'test': Dataset.from_pandas(test),
})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'target'],
        num_rows: 6090
    })
    validation: Dataset({
        features: ['index', 'text', 'target'],
        num_rows: 1523
    })
    test: Dataset({
        features: ['index', 'text'],
        num_rows: 3263
    })
})


In [19]:
from transformers import AutoTokenizer

# Tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
)

In [20]:
def preprocess(batch):
    """Tokenize a batch of texts and attach labels when the batch has them.

    Args:
        batch: Mapping of column name to a list of values. Must contain
            'text'; may contain 'target'.

    Returns:
        The tokenizer output for ``batch['text']``, truncated to at most 160
        tokens and left unpadded, with a 'labels' entry copied from
        ``batch['target']`` when that column is present.
    """
    # No padding here: padding inside a batched map pads every chunk to its own
    # longest sequence, and the rows are padded again when training batches are
    # collated. Leaving sequences unpadded lets the data collator pad each
    # batch only as far as that batch needs.
    tokenizer_batch = tokenizer(batch['text'], truncation=True, max_length=160)
    # Splits without a 'target' column (the test split) get no 'labels' entry.
    if 'target' in batch:
        tokenizer_batch['labels'] = batch['target']
    return tokenizer_batch

# Apply preprocess to every split, 32 examples per call; the bare name on the
# last line displays the resulting DatasetDict.
tokenized_ds = ds.map(
    preprocess,
    batched=True,
    batch_size=32,
)
tokenized_ds

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'target', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    validation: Dataset({
        features: ['index', 'text', 'target', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
    test: Dataset({
        features: ['index', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3263
    })
})

In [21]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [22]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load(
    'accuracy',
)

In [23]:
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions in ``eval_pred``.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels); the
            predictions must support ``.argmax(axis=1)``.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes
        against the reference labels.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class along axis 1 for every example.
    predicted_classes = scores.argmax(axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [24]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load 'distilbert-base-uncased' with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Display the model's module tree as the cell output.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [26]:
# Training configuration: optimisation settings first, then batch sizes, then
# the per-epoch evaluation/checkpoint schedule.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Wire the model, data splits, collator and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Fine-tune before evaluating: the classifier weights are newly initialised
# when the model is loaded, so the predict cells that follow only report
# meaningful numbers after this call has run.
trainer.train()

In [None]:
# Run the trainer over the validation split and show the returned metrics.
prediction = trainer.predict(
    tokenized_ds['validation'],
)
prediction.metrics

In [None]:
# Run the trainer over the test split (which has no 'target' column) and show
# the returned metrics. Note that this rebinds `prediction`.
prediction = trainer.predict(
    tokenized_ds['test'],
)
prediction.metrics

In [27]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def display_confusion_matrix(y_true, y_pred, dataset_name):
    """Plot a two-class confusion matrix and put the F1 score in its title.

    Args:
        y_true: True class ids (0 or 1) for each example.
        y_pred: Per-class scores for each example; the predicted class is the
            arg-max along axis 1.
        dataset_name: Name inserted into the plot title.

    Returns:
        None. The figure is created as a side effect.
    """
    # Reduce the scores to hard class ids once and reuse them for both the
    # plot and the F1 computation.
    y_pred_labels = np.argmax(y_pred, axis=1)
    # Pin both class ids so the matrix is always 2x2, even when one class is
    # absent from y_true and the predictions; otherwise the 4-way unpack below
    # and the two display labels would not match the matrix shape.
    class_ids = [0, 1]

    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        y_pred_labels,
        labels=class_ids,
        display_labels=["Not Disaster","Disaster"],
        cmap=plt.cm.Blues
    )

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_labels, labels=class_ids).ravel()
    # F1 = tp / (tp + (fn + fp) / 2); report 0.0 instead of dividing by zero
    # when there are no true positives, false positives or false negatives.
    denominator = tp + ((fn + fp) / 2)
    f1_score = np.float64(tp / denominator) if denominator else np.float64(0.0)

    disp.ax_.set_title("Confusion Matrix on " + dataset_name + " Dataset -- F1 Score: " + str(f1_score.round(2)))