In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [2]:
# Seed every random number generator the notebook relies on so that a fresh
# "Restart & Run All" is repeatable. A single constant keeps the seeds in sync.
SEED = 180
random.seed(SEED)
# The classification head of the model is newly (randomly) initialised by
# PyTorch, so torch's generator has to be seeded as well.
torch.manual_seed(SEED)
np.random.seed(SEED)

#### [Natural Language Processing with Disaster Tweets Dataset](https://www.kaggle.com/competitions/nlp-getting-started/overview)

In [3]:
# Read the train and test CSV files (train first, then test) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [4]:
# Show the first five rows to see which columns the training frame provides.
df_train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Relative frequency of each value in the "target" column.
df_train.loc[:, "target"].value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [6]:
# Plain-Python lists of the "id" and "text" columns of the test frame.
text_id = df_test.loc[:, "id"].to_list()
test_x = df_test.loc[:, "text"].to_list()

In [7]:
# Gather the tweet texts per target value (0 first, then 1) ...
df_0 = df_train.loc[df_train["target"].eq(0), "text"].to_list()
df_1 = df_train.loc[df_train["target"].eq(1), "text"].to_list()

# ... and build the inputs plus a label list aligned with that ordering.
train_x = [*df_0, *df_1]
train_y = [label for label, texts in ((0, df_0), (1, df_1)) for _ in texts]

In [8]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def prepare_data(train_x, train_y, test_size=0.2, seed=42):
    """Wrap texts and labels in a ``Dataset`` and split it into train/test parts.

    Args:
        train_x: Input texts; stored under the ``"text"`` column.
        train_y: Labels aligned with ``train_x``; stored under the ``"label"``
            column.
        test_size: Forwarded to ``Dataset.train_test_split``. Defaults to
            ``0.2``, the value that used to be hard-coded.
        seed: Forwarded to ``Dataset.train_test_split`` so the split is
            repeatable. Defaults to ``42``, the value that used to be
            hard-coded.

    Returns:
        The result of ``Dataset.train_test_split`` (the notebook indexes it
        with ``"train"`` and ``"test"``).
    """
    columns = {"text": train_x, "label": train_y}
    dataset = Dataset.from_dict(columns)
    return dataset.train_test_split(test_size=test_size, seed=seed)

In [10]:
def tokenize_function(examples, tokenizer):
    """Tokenize the ``"text"`` entry of ``examples`` with ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode.
        tokenizer: Callable applied to the text(s); it is invoked with
            truncation and padding enabled and ``max_length=128``.

    Returns:
        Whatever ``tokenizer`` returns for the given text(s).
    """
    texts = examples["text"]
    encoded = tokenizer(texts, max_length=128, padding=True, truncation=True)
    return encoded

In [11]:
def compute_metrics(pred):
    """Turn a prediction object into accuracy / precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose ``argmax`` over the last axis is the
            predicted class).

    Returns:
        dict: Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``;
        the last three are computed with ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    metrics.update(precision=precision, recall=recall, f1=f1)
    return metrics

In [12]:
# Load the tokenizer and a two-label sequence-classification model from the
# same pretrained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Display the module tree of the loaded model as the cell output.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
# Split the raw texts/labels, then tokenize every split in batches.
def _tokenize_batch(batch):
    """Apply ``tokenize_function`` to one batch using the notebook's tokenizer."""
    return tokenize_function(batch, tokenizer)


dataset = prepare_data(train_x, train_y)
tokenized_dataset = dataset.map(_tokenize_batch, batched=True)

Map: 100%|██████████| 6090/6090 [00:03<00:00, 1704.30 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 1706.03 examples/s]


In [15]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, which `load_best_model_at_end` requires in order to pick a checkpoint.
training_args = TrainingArguments(
    output_dir="./full_results",
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the current name.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    push_to_hub=False,
)



In [16]:
# Wire the model, the tokenized splits and the metric function into a Trainer,
# then run fine-tuning. The "test" part of the split serves as evaluation data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated for `Trainer.__init__` and emits a
    # FutureWarning; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
                                                  
 33%|███▎      | 381/1143 [12:08<20:30,  1.62s/it]

{'eval_loss': 0.39551469683647156, 'eval_accuracy': 0.8312541037426132, 'eval_precision': 0.8276422764227642, 'eval_recall': 0.7712121212121212, 'eval_f1': 0.7984313725490196, 'eval_runtime': 34.7293, 'eval_samples_per_second': 43.853, 'eval_steps_per_second': 1.382, 'epoch': 1.0}


 44%|████▎     | 500/1143 [17:06<21:28,  2.00s/it]  

{'loss': 0.4101, 'grad_norm': 4.495163917541504, 'learning_rate': 1.1251093613298338e-05, 'epoch': 1.31}


                                                  
 67%|██████▋   | 762/1143 [27:21<13:19,  2.10s/it]

{'eval_loss': 0.41025134921073914, 'eval_accuracy': 0.8338804990151018, 'eval_precision': 0.8330605564648118, 'eval_recall': 0.7712121212121212, 'eval_f1': 0.8009441384736428, 'eval_runtime': 43.7067, 'eval_samples_per_second': 34.846, 'eval_steps_per_second': 1.098, 'epoch': 2.0}


 87%|████████▋ | 1000/1143 [36:14<05:26,  2.28s/it] 

{'loss': 0.2774, 'grad_norm': 2.648693799972534, 'learning_rate': 2.502187226596676e-06, 'epoch': 2.62}


                                                   
100%|██████████| 1143/1143 [42:20<00:00,  2.09s/it]

{'eval_loss': 0.46887239813804626, 'eval_accuracy': 0.8279711096520026, 'eval_precision': 0.819935691318328, 'eval_recall': 0.7727272727272727, 'eval_f1': 0.7956318252730109, 'eval_runtime': 45.1728, 'eval_samples_per_second': 33.715, 'eval_steps_per_second': 1.063, 'epoch': 3.0}


100%|██████████| 1143/1143 [42:22<00:00,  2.22s/it]

{'train_runtime': 2542.0907, 'train_samples_per_second': 7.187, 'train_steps_per_second': 0.45, 'train_loss': 0.33256411948750547, 'epoch': 3.0}





TrainOutput(global_step=1143, training_loss=0.33256411948750547, metrics={'train_runtime': 2542.0907, 'train_samples_per_second': 7.187, 'train_steps_per_second': 0.45, 'total_flos': 396551506892256.0, 'train_loss': 0.33256411948750547, 'epoch': 3.0})

In [17]:
# Persist the tokenizer files next to the fine-tuned weights so the directory
# can be reloaded for inference on its own. The model is saved last so the
# cell ends on a call that returns nothing to display.
tokenizer.save_pretrained("./full_tuning_model")
model.save_pretrained("./full_tuning_model")

In [18]:
def predict_tweet(tweet, tokenizer, model):
    """Classify a single tweet and return the index of the largest logit.

    Args:
        tweet: Text to classify.
        tokenizer: Callable that encodes ``tweet`` into a mapping of PyTorch
            tensors (it is called with ``return_tensors="pt"``, truncation,
            padding and ``max_length=128``).
        model: ``torch.nn.Module`` whose output exposes ``.logits``.

    Returns:
        int: Predicted class index.
    """
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the encoded tensors to the device holding the model's weights;
    # otherwise a model on an accelerator would receive CPU tensors.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Predict in eval mode (dropout disabled) and without building an autograd
    # graph, then restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    return torch.argmax(outputs.logits, dim=1).item()

# Try the fine-tuned model on one hand-written example and print its label.
my_tweet = """
For those who have been waiting for this scene...

Now, a mass exodus of settlers from northern occupied Palestine have left their homes burning and are fleeing.
"""

prediction = predict_tweet(tweet=my_tweet, tokenizer=tokenizer, model=model)
print(f"The predicted label is: {prediction}")

The predicted label is: 1


In [19]:
# Predict a label for every test tweet. Reading the two columns directly
# avoids `DataFrame.iterrows`, which builds a new Series for every row, and
# keeps the loop from overwriting the `prediction` shown for the demo tweet.
ids = df_test["id"].tolist()
preds = [predict_tweet(tweet, tokenizer, model) for tweet in df_test["text"]]

In [20]:
# Pair every test id with its predicted label.
submission_columns = {"id": ids, "target": preds}
df_submit = pd.DataFrame(data=submission_columns)

In [21]:
# Show the first five rows of the submission frame.
df_submit.head(n=5)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [22]:
# Write the submission file. `to_csv` does not create missing directories, so
# make sure the output folder exists first.
submission_path = Path("./results/distilbert.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
df_submit.to_csv(submission_path, index=False)