# NLP with Disaster Tweets Kaggle
1. Set up the experimentation framework
    - Subset the train data into train/val/test
    - Optimize the model on the train dataset, using val to evaluate performance per epoch
    - Evaluate performance on test
    - If performance is good enough, submit the predictions to Kaggle and see what the resulting leaderboard score is
    - I should be able to track the performance of a given set of hyperparameters or design decisions through the whole process. I.e., I'll get train, val, and test performance numbers, then I'll retrain on the whole dataset using that approach, then I'll submit that to Kaggle, evaluate the leaderboard performance for that submission, and add it to the experiment tracker.
2. Ok, that's all great. Let's see if I can structure my code so that the repeatable, reusable part lives in one place and the custom code for reading in and preprocessing the data lives in a different place. That way I could theoretically swap out the preprocessing code and keep the model training code if I wanted.


In [2]:
import polars as pl
from omegaconf import OmegaConf
from pathlib import Path
import os

import torch

In [3]:
# Experiment configuration container; intentionally empty for now.
cfg = OmegaConf.create({})

In [4]:
# Locations of the competition CSV files, relative to this notebook.
train_path, test_path, sample_submission_path = (
    '../data/train.csv',
    '../data/test.csv',
    '../data/sample_submission.csv',
)

In [5]:
# Read both competition splits into polars DataFrames.
df_train, df_test = (
    pl.read_csv(split_path) for split_path in (train_path, test_path)
)

In [20]:
# (rows, columns) of the train and test frames.
(df_train.shape, df_test.shape)

((7613, 5), (3263, 4))

In [6]:
# Peek at five random training rows.
df_train.sample(n=5)

id,keyword,location,text,target
i64,str,str,str,i64
9565,"""thunder""","""Enfield, UK""","""#PlayingNow #BLOODBOUND Seven …",0
2743,"""crushed""",,"""So many Youtube commenters say…",1
1845,"""burned""","""956""","""It hurts for me to eat cause i…",0
8212,"""riot""",,"""@AcaciaPenn I'll start a big a…",0
2730,"""crushed""","""Guayaquil""","""I crushed a 3.1 km run with a …",0


In [7]:
# Peek at five random test rows.
df_test.sample(n=5)

id,keyword,location,text
i64,str,str,str
7726,"""panicking""","""9.25.14?8.5.15?10.6.15 | gen?""","""this is from my show last nigh…"
4839,"""evacuation""","""NIFC""","""#MadRiverComplex #CA #CASRF ht…"
4746,"""evacuate""",,"""@yourgirlhaileyy leaveevacuate…"
8076,"""rescue""",,"""@wcvh01 @1233newcastle @aaronk…"
2247,"""chemical%20emergency""",,"""@bendwavy emergency chemical r…"


In [8]:
# Count missing values per column to see which fields are incomplete.
df_train.null_count()

id,keyword,location,text,target
u32,u32,u32,u32,u32
0,61,2533,0,0


In [9]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

In [None]:
def train_wordpiece_tokenizer_from_dataset(
    csv_file_path,
    vocab_size=30000,
    special_tokens=None,
    output_path="tokenizer.json"
):
    """Train a WordPiece tokenizer on the 'text' column of a CSV file and save it.

    Args:
        csv_file_path: Path to a CSV file that contains a 'text' column.
        vocab_size: Target vocabulary size handed to the WordPiece trainer.
        special_tokens: Tokens reserved in the vocabulary. When None, defaults to
            ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"].
        output_path: File the trained tokenizer is written to.

    Raises:
        ValueError: If the CSV file has no 'text' column.
    """
    # Resolve the default here instead of in the signature so that no mutable
    # list object is shared between calls.
    if special_tokens is None:
        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    else:
        special_tokens = list(special_tokens)

    # Load the CSV file as a dataset using Hugging Face Datasets library.
    # This creates a default "train" split.
    dataset = load_dataset("csv", data_files=csv_file_path)

    # Ensure that the dataset contains a 'text' column.
    if "text" not in dataset["train"].column_names:
        raise ValueError("The CSV file must contain a 'text' column.")

    # Extract texts from the dataset, dropping missing entries: empty CSV cells
    # are loaded as None, which the tokenizer trainer cannot consume.
    texts = [text for text in dataset["train"]["text"] if text is not None]

    # Create a WordPiece tokenizer with a designated unknown token.
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # Use a simple whitespace pre-tokenizer.
    tokenizer.pre_tokenizer = Whitespace()

    # Initialize the trainer with the desired vocabulary size and special tokens.
    trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)

    # Train the tokenizer on the extracted texts.
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # Save the trained tokenizer to the specified output path.
    tokenizer.save(output_path)
    print(f"Tokenizer successfully saved to {output_path}")

# Train the tokenizer on the competition's training CSV.
csv_path = train_path
train_wordpiece_tokenizer_from_dataset(csv_file_path=csv_path)




Tokenizer successfully saved to tokenizer.json


In [10]:
from transformers import PreTrainedTokenizerFast

# Wrap the saved tokenizer file in a transformers fast tokenizer, declaring
# which tokens fill the special-token roles.
special_token_kwargs = dict(
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    **special_token_kwargs
)

# Smoke test: encode a single sentence and show the resulting encoding.
sample_text = "Hello, how are you?"
encoded = tokenizer(sample_text)
print(encoded)

{'input_ids': [7582, 15, 701, 365, 279, 33], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [11]:
# Read the training CSV through Hugging Face Datasets; with a single file
# the rows land in a default "train" split.
dataset = load_dataset(path="csv", data_files=train_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Inspect the loaded DatasetDict: available splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 7613
    })
})

In [15]:
# Preview the first five rows of the train split as a pandas DataFrame.
dataset["train"].to_pandas().head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Try again

In [17]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

In [18]:
%%time

# Build an untrained WordPiece model; "[UNK]" is used for out-of-vocabulary pieces.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Split on whitespace/punctuation before WordPiece is applied.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordPieceTrainer(
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

# Train on the tweet text only. Handing the CSV path to `tokenizer.train` would
# read the file as plain text, so the header row, ids, keywords, locations and
# target labels would all end up in the vocabulary.
texts = df_train["text"].drop_nulls().to_list()
tokenizer.train_from_iterator(texts, trainer=trainer)

tokenizer.save("tokenizer.json")




CPU times: user 936 ms, sys: 1.69 s, total: 2.62 s
Wall time: 539 ms
