### **HuggingFace Login**

In [18]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **Import Libairies**

In [19]:
from IPython.display import display

import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import Dataset



### **Read Datasets**

In [20]:
'''
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv", index_col=0)
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv", index_col=0)

df_train = pd.read_csv("/media/yanncauchepin/ExternalDisk/Datasets/NaturalLanguageProcessing/disaster_tweets/train.csv", index_col=0)
df_test = pd.read_csv("/media/yanncauchepin/ExternalDisk/Datasets/NaturalLanguageProcessing/disaster_tweets/test.csv", index_col=0)
'''

df_train = pd.read_csv("C:/Users/cauchepy/Datasets/NaturalLanguageProcessing/kaggle_disastertweets/train.csv", index_col=0)
df_test = pd.read_csv("C:/Users/cauchepy/Datasets/NaturalLanguageProcessing/kaggle_disastertweets/test.csv", index_col=0)


### **Short Analysis**

In [21]:
print(f"Length - train {len(df_train)} - test {len(df_test)}")

Length - train 7613 - test 3263


### **Preprocess Datasets**

##### _Merge columns (full)_

In [22]:
keywords = pd.concat([
    pd.DataFrame(df_train["keyword"].value_counts()).rename(columns={"count":"train"}),
    pd.DataFrame(df_test["keyword"].value_counts()).rename(columns={"count":"test"})
    ],
    axis=1
)
keywords.head()

Unnamed: 0_level_0,train,test
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
fatalities,45,5
deluge,42,8
armageddon,42,8
damage,41,9
body%20bags,41,9


In [23]:
locations = pd.concat([
    pd.DataFrame(df_train["location"].value_counts()).rename(columns={"count":"train"}),
    pd.DataFrame(df_test["location"].value_counts()).rename(columns={"count":"test"})
    ],
    axis=1
)
locations.head()

Unnamed: 0_level_0,train,test
location,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,104.0,37.0
New York,71.0,38.0
United States,50.0,15.0
London,45.0,13.0
Canada,29.0,13.0


In [24]:
df_train_full = df_train.copy()
df_train_full['text'] = df_train_full.apply(lambda row: f"{row['location'] or ''} {row['keyword'] or ''} {row['text']}".strip(), axis=1)
df_test_full = df_test.copy()
df_test_full['text'] = df_test_full.apply(lambda row: f"{row['location'] or ''} {row['keyword'] or ''} {row['text']}".strip(), axis=1)


### **Model Assessment**

In [25]:
def evaluate_classifier(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    
    metrics_df = pd.DataFrame({
        'Value': [f1, precision, recall]
    }, index=['F1 Score', 'Precision', 'Recall'])
    
    cm_df = pd.DataFrame(cm, columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])
    
    return metrics_df, cm_df

### **DISTIL BERT Tokenizer**

##### _DISTIL BERT Tokenizer + split validation_

In [26]:
distill_bert_tokenizer_full = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

distill_bert_encodings_full = distill_bert_tokenizer_full(list(df_train_full['text']), truncation=True, padding=True, max_length=256)
distill_bert_labels_full = torch.tensor(list(df_train_full['target']))

distill_bert_input_ids_train_full, distill_bert_input_ids_valid_full, \
distill_bert_attention_mask_train_full, distill_bert_attention_mask_valid_full, distill_bert_y_train_full, distill_bert_y_valid_full = train_test_split(
    distill_bert_encodings_full['input_ids'], 
    distill_bert_encodings_full['attention_mask'], 
    distill_bert_labels_full, 
    test_size=0.15, 
    stratify=distill_bert_labels_full, 
    random_state=0
)

distill_bert_train_encodings_full = {
    'input_ids': torch.tensor(distill_bert_input_ids_train_full),
    'attention_mask': torch.tensor(distill_bert_attention_mask_train_full)
}

distill_bert_valid_encodings_full = {
    'input_ids': torch.tensor(distill_bert_input_ids_valid_full),
    'attention_mask': torch.tensor(distill_bert_attention_mask_valid_full)
}

distill_bert_train_dataset_full = Dataset.from_dict({
    "input_ids": distill_bert_train_encodings_full['input_ids'],
    "attention_mask": distill_bert_train_encodings_full['attention_mask'],
    "labels": distill_bert_y_train_full
})

distill_bert_valid_dataset_full = Dataset.from_dict({
    "input_ids": distill_bert_valid_encodings_full['input_ids'],
    "attention_mask": distill_bert_valid_encodings_full['attention_mask'],
    "labels": distill_bert_y_valid_full
})

distill_bert_test_encodings_full = distill_bert_tokenizer_full(list(df_test_full['text']), truncation=True, padding=True, max_length=256)

distill_bert_test_encodings_full = {
    key: torch.tensor(val) for key, val in distill_bert_test_encodings_full.items()
    }

distill_bert_test_dataset_full = Dataset.from_dict({
    "input_ids": distill_bert_test_encodings_full['input_ids'],
    "attention_mask": distill_bert_test_encodings_full['attention_mask']
})

##### _Save HuggingFace DISTIL BERT Tokenizer_

In [27]:
distill_bert_tokenizer_full.push_to_hub("yanncauchepin/kaggle_disastertweets_distill_bert_tokenizer")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/yanncauchepin/kaggle_disastertweets_distill_bert_tokenizer/commit/e2566ad7feb72933925f6db6828bef8465007d47', commit_message='Upload tokenizer', commit_description='', oid='e2566ad7feb72933925f6db6828bef8465007d47', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yanncauchepin/kaggle_disastertweets_distill_bert_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='yanncauchepin/kaggle_disastertweets_distill_bert_tokenizer'), pr_revision=None, pr_num=None)

### **DISTIL BERT Training from scratch**

##### _DISTIL BERT Transformers + local save_

In [28]:
distill_bert_model_full = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    hub_model_id="kaggle_disastertweets_distill_bert_model",
    output_dir="./distil_bert_results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./distil_bert_logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

distill_bert_trainer_full = Trainer(
    model=distill_bert_model_full,
    args=training_args,
    train_dataset=distill_bert_train_dataset_full,
    eval_dataset=distill_bert_valid_dataset_full
)

distill_bert_trainer_full.train()

distill_bert_trainer_full.save_model("distil_bert_model") # Local

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/8090 [00:00<?, ?it/s]

{'loss': 0.6927, 'grad_norm': 3.3131556510925293, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.693, 'grad_norm': 2.104788303375244, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 0.6855, 'grad_norm': 4.590396404266357, 'learning_rate': 3e-06, 'epoch': 0.02}
{'loss': 0.7106, 'grad_norm': 3.161708116531372, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 0.6996, 'grad_norm': 2.9289047718048096, 'learning_rate': 5e-06, 'epoch': 0.03}
{'loss': 0.6749, 'grad_norm': 2.7170984745025635, 'learning_rate': 6e-06, 'epoch': 0.04}
{'loss': 0.6808, 'grad_norm': 3.215543270111084, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.04}
{'loss': 0.68, 'grad_norm': 1.9614405632019043, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.05}
{'loss': 0.6683, 'grad_norm': 3.3980209827423096, 'learning_rate': 9e-06, 'epoch': 0.06}
{'loss': 0.6785, 'grad_norm': 3.452338695526123, 'learning_rate': 1e-05, 'epoch': 0.06}
{'loss': 0.6555, 'grad_norm': 2.5501

  0%|          | 0/286 [00:00<?, ?it/s]

{'eval_loss': 0.4777453541755676, 'eval_runtime': 3.0869, 'eval_samples_per_second': 369.952, 'eval_steps_per_second': 92.65, 'epoch': 1.0}
{'loss': 0.3652, 'grad_norm': 3.71907114982605, 'learning_rate': 4.262187088274045e-05, 'epoch': 1.0}
{'loss': 0.4491, 'grad_norm': 12.228489875793457, 'learning_rate': 4.255599472990778e-05, 'epoch': 1.01}
{'loss': 0.6843, 'grad_norm': 0.6215776205062866, 'learning_rate': 4.24901185770751e-05, 'epoch': 1.01}
{'loss': 0.3914, 'grad_norm': 0.6190232038497925, 'learning_rate': 4.242424242424243e-05, 'epoch': 1.02}
{'loss': 0.4148, 'grad_norm': 6.3058085441589355, 'learning_rate': 4.235836627140975e-05, 'epoch': 1.03}
{'loss': 0.3264, 'grad_norm': 303.2306823730469, 'learning_rate': 4.229249011857708e-05, 'epoch': 1.03}
{'loss': 1.056, 'grad_norm': 200.380126953125, 'learning_rate': 4.222661396574441e-05, 'epoch': 1.04}
{'loss': 0.4985, 'grad_norm': 503.67108154296875, 'learning_rate': 4.216073781291173e-05, 'epoch': 1.04}
{'loss': 0.5403, 'grad_norm'

  0%|          | 0/286 [00:00<?, ?it/s]

{'eval_loss': 0.6021955013275146, 'eval_runtime': 3.1555, 'eval_samples_per_second': 361.905, 'eval_steps_per_second': 90.635, 'epoch': 2.0}
{'loss': 0.4803, 'grad_norm': 10.148124694824219, 'learning_rate': 3.194993412384717e-05, 'epoch': 2.0}
{'loss': 0.4048, 'grad_norm': 0.45618295669555664, 'learning_rate': 3.188405797101449e-05, 'epoch': 2.01}
{'loss': 0.4508, 'grad_norm': 0.6213948726654053, 'learning_rate': 3.181818181818182e-05, 'epoch': 2.01}
{'loss': 0.407, 'grad_norm': 0.26481470465660095, 'learning_rate': 3.175230566534915e-05, 'epoch': 2.02}
{'loss': 0.3714, 'grad_norm': 0.5197944641113281, 'learning_rate': 3.168642951251647e-05, 'epoch': 2.03}
{'loss': 0.4553, 'grad_norm': 0.34316882491111755, 'learning_rate': 3.16205533596838e-05, 'epoch': 2.03}
{'loss': 0.301, 'grad_norm': 9.573576927185059, 'learning_rate': 3.155467720685112e-05, 'epoch': 2.04}
{'loss': 0.1969, 'grad_norm': 10.143568992614746, 'learning_rate': 3.148880105401845e-05, 'epoch': 2.05}
{'loss': 0.2671, 'gra

  0%|          | 0/286 [00:00<?, ?it/s]

{'eval_loss': 0.7443392276763916, 'eval_runtime': 3.1732, 'eval_samples_per_second': 359.887, 'eval_steps_per_second': 90.129, 'epoch': 3.0}
{'loss': 0.1802, 'grad_norm': 0.3580894470214844, 'learning_rate': 2.127799736495389e-05, 'epoch': 3.0}
{'loss': 0.1268, 'grad_norm': 0.09043201804161072, 'learning_rate': 2.1212121212121215e-05, 'epoch': 3.01}
{'loss': 0.264, 'grad_norm': 6.902551174163818, 'learning_rate': 2.114624505928854e-05, 'epoch': 3.02}
{'loss': 0.1141, 'grad_norm': 3.9793522357940674, 'learning_rate': 2.1080368906455864e-05, 'epoch': 3.02}
{'loss': 0.138, 'grad_norm': 0.061234794557094574, 'learning_rate': 2.101449275362319e-05, 'epoch': 3.03}
{'loss': 0.0343, 'grad_norm': 25.054922103881836, 'learning_rate': 2.0948616600790517e-05, 'epoch': 3.03}
{'loss': 0.1405, 'grad_norm': 0.05548339709639549, 'learning_rate': 2.088274044795784e-05, 'epoch': 3.04}
{'loss': 0.1491, 'grad_norm': 0.5225006937980652, 'learning_rate': 2.0816864295125166e-05, 'epoch': 3.05}
{'loss': 0.4245

  0%|          | 0/286 [00:00<?, ?it/s]

{'eval_loss': 0.897905170917511, 'eval_runtime': 3.1616, 'eval_samples_per_second': 361.206, 'eval_steps_per_second': 90.46, 'epoch': 4.0}
{'loss': 0.0974, 'grad_norm': 0.13111251592636108, 'learning_rate': 1.0606060606060607e-05, 'epoch': 4.0}
{'loss': 0.057, 'grad_norm': 0.1346280872821808, 'learning_rate': 1.0540184453227932e-05, 'epoch': 4.01}
{'loss': 0.0724, 'grad_norm': 0.03194446116685867, 'learning_rate': 1.0474308300395258e-05, 'epoch': 4.02}
{'loss': 0.2246, 'grad_norm': 0.35456565022468567, 'learning_rate': 1.0408432147562583e-05, 'epoch': 4.02}
{'loss': 0.457, 'grad_norm': 0.03798054903745651, 'learning_rate': 1.0342555994729908e-05, 'epoch': 4.03}
{'loss': 0.0075, 'grad_norm': 2.116264820098877, 'learning_rate': 1.0276679841897234e-05, 'epoch': 4.04}
{'loss': 0.3864, 'grad_norm': 9.093986511230469, 'learning_rate': 1.0210803689064559e-05, 'epoch': 4.04}
{'loss': 0.2781, 'grad_norm': 3.439152240753174, 'learning_rate': 1.0144927536231885e-05, 'epoch': 4.05}
{'loss': 0.0028

  0%|          | 0/286 [00:00<?, ?it/s]

{'eval_loss': 1.066947102546692, 'eval_runtime': 3.3301, 'eval_samples_per_second': 342.934, 'eval_steps_per_second': 85.884, 'epoch': 5.0}
{'train_runtime': 552.214, 'train_samples_per_second': 58.591, 'train_steps_per_second': 14.65, 'train_loss': 0.35860296717012585, 'epoch': 5.0}


##### _Save HugggingFace trained DISTIL BERT Transformers_ 

In [29]:
distill_bert_trainer_full.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yanncauchepin/kaggle_disastertweets_distill_bert_model/commit/62fa20aa9e3c22e653fe87041413c880aba156ad', commit_message='End of training', commit_description='', oid='62fa20aa9e3c22e653fe87041413c880aba156ad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yanncauchepin/kaggle_disastertweets_distill_bert_model', endpoint='https://huggingface.co', repo_type='model', repo_id='yanncauchepin/kaggle_disastertweets_distill_bert_model'), pr_revision=None, pr_num=None)

### **DISTIL BERT Loading from pretrained**

In [30]:
distill_bert_model_full = DistilBertForSequenceClassification.from_pretrained("distil_bert_model")

training_args = TrainingArguments(
    hub_model_id="kaggle_disastertweets_distill_bert_model",
    output_dir="./distil_bert_results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./distil_bert_logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

distill_bert_trainer_full = Trainer(
    model=distill_bert_model_full,
    args=training_args,
    train_dataset=distill_bert_train_dataset_full,
    eval_dataset=distill_bert_valid_dataset_full
)

### **DISTIL BERT Assessment**

In [31]:
distill_bert_predictions_full = distill_bert_trainer_full.predict(distill_bert_valid_dataset_full)
distill_bert_logits_full = distill_bert_predictions_full.predictions
distill_bert_y_pred_full = np.argmax(distill_bert_logits_full, axis=1)

distill_bert_trainer_full_assessement = evaluate_classifier(distill_bert_y_valid_full.numpy(), distill_bert_y_pred_full)
display(distill_bert_trainer_full_assessement[0])
display(distill_bert_trainer_full_assessement[1])

  0%|          | 0/286 [00:00<?, ?it/s]

Unnamed: 0,Value
F1 Score,0.831565
Precision,0.835722
Recall,0.833625


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,590,61
Actual Positive,129,362


### **DISTIL BERT Test Predictions**

##### _DISTIL BERT Test Predictions_

In [32]:
distill_bert_test_predictions_full = distill_bert_trainer_full.predict(distill_bert_test_dataset_full)
distill_bert_test_logits_full = distill_bert_test_predictions_full.predictions
distill_bert_test_y_pred_full = np.argmax(distill_bert_test_logits_full, axis=1)

distill_bert_test_submission_full = pd.DataFrame({
    'id': df_test_full.index,
    'target': distill_bert_test_y_pred_full.flatten()
})

  0%|          | 0/816 [00:00<?, ?it/s]

##### _Save HuggingFace DISTIL BERT Test Predictions_

In [33]:
hf_distill_bert_test_submission_full = Dataset.from_pandas(distill_bert_test_submission_full)
hf_distill_bert_test_submission_full.push_to_hub("yanncauchepin/kaggle_disastertweets_distill_bert_submission_df")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/299 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/yanncauchepin/kaggle_disastertweets_distill_bert_submission_df/commit/449c9aa23fa55ab964fcc652f34038b8a4e84bf4', commit_message='Upload dataset', commit_description='', oid='449c9aa23fa55ab964fcc652f34038b8a4e84bf4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yanncauchepin/kaggle_disastertweets_distill_bert_submission_df', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yanncauchepin/kaggle_disastertweets_distill_bert_submission_df'), pr_revision=None, pr_num=None)