### **HuggingFace Login**

In [3]:
# Authenticate this session against the Hugging Face Hub (interactive prompt).
# The later cells that call `push_to_hub` upload to the Hub.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **Import Libraries**

In [4]:
import os
from pathlib import Path

from IPython.display import display

import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset






### **Read Datasets**

In [5]:
# Directory holding the competition's train.csv / test.csv files.
# Set the DISASTER_TWEETS_DIR environment variable to point elsewhere instead of
# editing the notebook (e.g. "/kaggle/input/nlp-getting-started" when on Kaggle).
DATA_DIR = Path(os.environ.get(
    "DISASTER_TWEETS_DIR",
    "C:/Users/cauchepy/Datasets/NaturalLanguageProcessing/kaggle_disastertweets",
))

# The first CSV column is used as the DataFrame index.
df_train = pd.read_csv(DATA_DIR / "train.csv", index_col=0)
df_test = pd.read_csv(DATA_DIR / "test.csv", index_col=0)


### **Short Analysis**

In [6]:
# Row counts of the two splits, reported on a single line.
n_train, n_test = len(df_train), len(df_test)
print(f"Length - train {n_train} - test {n_test}")

Length - train 7613 - test 3263


### **Preprocess Datasets**

##### _Merge columns (full)_

In [7]:
# Per-keyword tweet counts in each split, aligned on the keyword value.
# The counts Series are named directly rather than renaming a "count" column:
# that column name only exists on pandas >= 2.0, so the rename would silently
# do nothing on older versions and leave both columns with the same label.
keywords = pd.concat(
    [
        df_train["keyword"].value_counts().rename("train"),
        df_test["keyword"].value_counts().rename("test"),
    ],
    axis=1,
)
keywords.head()

Unnamed: 0_level_0,train,test
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
fatalities,45,5
deluge,42,8
armageddon,42,8
damage,41,9
body%20bags,41,9


In [8]:
# Per-location tweet counts in each split, aligned on the location value.
# The counts Series are named directly rather than renaming a "count" column:
# that column name only exists on pandas >= 2.0, so the rename would silently
# do nothing on older versions and leave both columns with the same label.
locations = pd.concat(
    [
        df_train["location"].value_counts().rename("train"),
        df_test["location"].value_counts().rename("test"),
    ],
    axis=1,
)
locations.head()

Unnamed: 0_level_0,train,test
location,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,104.0,37.0
New York,71.0,38.0
United States,50.0,15.0
London,45.0,13.0
Canada,29.0,13.0


In [9]:
def _merge_text_columns(df):
    """Return a copy of ``df`` whose ``text`` is prefixed with location and keyword.

    Each row's new ``text`` is "<location> <keyword> <text>". Missing (NaN)
    fields are skipped: NaN is truthy, so an ``x or ''`` fallback never fires
    and would inject the literal string "nan" into the text instead.
    """
    def _join_row(row):
        fields = (row['location'], row['keyword'], row['text'])
        return " ".join(str(value) for value in fields if pd.notna(value)).strip()

    merged = df.copy()
    merged['text'] = merged.apply(_join_row, axis=1)
    return merged


df_train_full = _merge_text_columns(df_train)
df_test_full = _merge_text_columns(df_test)


### **Model Assessment**

In [10]:
def evaluate_classifier(y_true, y_pred):
    """Summarise binary (0/1) classification results as two DataFrames.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``metrics_df`` holds the weighted F1, precision and recall in a single
        ``Value`` column; ``cm_df`` is the 2x2 confusion matrix with actual
        classes as rows and predicted classes as columns.
    """
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    # Pin the label order so the matrix is always 2x2, even when only one class
    # appears in y_true/y_pred; otherwise the DataFrame construction below
    # fails on a shape mismatch.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    metrics_df = pd.DataFrame({
        'Value': [f1, precision, recall]
    }, index=['F1 Score', 'Precision', 'Recall'])

    cm_df = pd.DataFrame(
        cm,
        columns=['Predicted Negative', 'Predicted Positive'],
        index=['Actual Negative', 'Actual Positive'],
    )

    return metrics_df, cm_df

### **BERT Tokenizer**

##### _BERT Tokenizer + split validation_

In [11]:
# Build the train / validation / test datasets consumed by the Trainer cells below.

# Tokenizer loaded from the "bert-base-uncased" checkpoint.
bert_tokenizer_full = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the merged training text (truncation and padding enabled, max_length=256);
# labels come from the `target` column.
bert_encodings_full = bert_tokenizer_full(list(df_train_full['text']), truncation=True, padding=True, max_length=256)
bert_labels_full = torch.tensor(list(df_train_full['target']))

# Hold out 10% of the rows for validation, stratified on the labels.
# All encoding fields and the labels are split in a single call so they stay row-aligned;
# random_state is fixed so the split is repeatable.
bert_input_ids_train_full, bert_input_ids_valid_full, bert_token_type_ids_train_full, bert_token_type_ids_valid_full, \
bert_attention_mask_train_full, bert_attention_mask_valid_full, bert_y_train_full, bert_y_valid_full = train_test_split(
    bert_encodings_full['input_ids'], 
    bert_encodings_full['token_type_ids'], 
    bert_encodings_full['attention_mask'], 
    bert_labels_full, 
    test_size=0.10, 
    stratify=bert_labels_full, 
    random_state=0
)

# Convert the split encoding lists to tensors.
bert_train_encodings_full = {
    'input_ids': torch.tensor(bert_input_ids_train_full),
    'token_type_ids': torch.tensor(bert_token_type_ids_train_full),
    'attention_mask': torch.tensor(bert_attention_mask_train_full)
}

bert_valid_encodings_full = {
    'input_ids': torch.tensor(bert_input_ids_valid_full),
    'token_type_ids': torch.tensor(bert_token_type_ids_valid_full),
    'attention_mask': torch.tensor(bert_attention_mask_valid_full)
}

# NOTE: only input_ids, attention_mask and labels go into the Dataset objects;
# the token_type_ids tensors built above are not included.
bert_train_dataset_full = Dataset.from_dict({
    "input_ids": bert_train_encodings_full['input_ids'],
    "attention_mask": bert_train_encodings_full['attention_mask'],
    "labels": bert_y_train_full
})

bert_valid_dataset_full = Dataset.from_dict({
    "input_ids": bert_valid_encodings_full['input_ids'],
    "attention_mask": bert_valid_encodings_full['attention_mask'],
    "labels": bert_y_valid_full
})

# The test text is tokenized in its own call with the same tokenizer settings.
bert_test_encodings_full = bert_tokenizer_full(list(df_test_full['text']), truncation=True, padding=True, max_length=256)

# Rebinds the name: every field of the tokenizer output becomes a tensor.
bert_test_encodings_full = {
    key: torch.tensor(val) for key, val in bert_test_encodings_full.items()
}

# Test dataset has no "labels" column.
bert_test_dataset_full = Dataset.from_dict({
    "input_ids": bert_test_encodings_full['input_ids'],
    "attention_mask": bert_test_encodings_full['attention_mask']
})

##### _Save HuggingFace BERT Tokenizer_

In [None]:
# Write the tokenizer files to a local directory named after the project.
TOKENIZER_DIR = "kaggle_disastertweets_bert_tokenizer"
bert_tokenizer_full.save_pretrained(TOKENIZER_DIR)

### **BERT Fine-Tuning (from the pretrained `bert-base-uncased` checkpoint)**

##### _BERT Transformers + local save_

In [None]:
# Start from the pretrained "bert-base-uncased" checkpoint with a 2-class
# classification head and fine-tune it on the training split.
bert_model_full = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./bert_logs",
    logging_steps=10,
    eval_strategy="epoch",  # evaluate on the validation split after each epoch
    report_to="none",       # no external experiment trackers
    use_cpu=True,           # force CPU; replaces the deprecated `no_cuda=True`
)

bert_trainer_full = Trainer(
    model=bert_model_full,
    args=training_args,
    train_dataset=bert_train_dataset_full,
    eval_dataset=bert_valid_dataset_full
)

bert_trainer_full.train()

# Local copy of the fine-tuned model; the "BERT Loading" cell reads this directory.
bert_trainer_full.save_model("bert_model")

##### _Save HuggingFace trained BERT Transformers_

In [None]:
# Push through the model, whose `push_to_hub` takes the target repo id first.
# `Trainer.push_to_hub` takes a commit message as its first positional argument
# and picks the repo from the TrainingArguments, so passing the repo id there
# would upload to the wrong repository.
bert_trainer_full.model.push_to_hub("yanncauchepin/kaggle_disastertweets_bert_model")

### **BERT Loading from pretrained**

In [12]:
# Reload the model from the local "bert_model" directory.
FINETUNED_MODEL_DIR = "bert_model"
bert_model_full = BertForSequenceClassification.from_pretrained(FINETUNED_MODEL_DIR)

# Wrap it in a Trainer with default arguments; the cells below call .predict() on it.
bert_trainer_full = Trainer(model=bert_model_full)

### **BERT Assessment**

In [13]:
# Predict on the validation split and take the highest-scoring class per row.
bert_predictions_full = bert_trainer_full.predict(bert_valid_dataset_full)
bert_logits_full = bert_predictions_full.predictions
bert_y_pred_full = np.argmax(bert_logits_full, axis=1)

# evaluate_classifier returns (metrics table, confusion matrix); show both in order.
y_valid_np = bert_y_valid_full.numpy()
bert_trainer_full_assessement = evaluate_classifier(y_valid_np, bert_y_pred_full)
for report_frame in bert_trainer_full_assessement:
    display(report_frame)

AttributeError: 'BertForSequenceClassification' object has no attribute 'predict'

### **BERT Test Predictions**

##### _BERT Test Predictions_

In [None]:
# Predict on the test split and take the highest-scoring class per row.
bert_test_predictions_full = bert_trainer_full.predict(bert_test_dataset_full)
bert_test_logits_full = bert_test_predictions_full.predictions
bert_test_y_pred_full = np.argmax(bert_test_logits_full, axis=1)

# Submission table: the test frame's index supplies the `id` column.
submission_columns = {
    'id': df_test_full.index,
    'target': bert_test_y_pred_full.flatten(),
}
bert_test_submission_full = pd.DataFrame(submission_columns)

##### _Save HuggingFace BERT Test Predictions_

In [None]:
# Convert the submission table to a `datasets.Dataset` and upload it to the Hub.
SUBMISSION_REPO_ID = "yanncauchepin/kaggle_disastertweets_bert_submission_df"
hf_bert_test_submission_full = Dataset.from_pandas(bert_test_submission_full)
hf_bert_test_submission_full.push_to_hub(SUBMISSION_REPO_ID)