# NLP with Disaster Tweets Kaggle
1. Set up the experimentation framework
    - Subset the train data into train/val/test
    - Optimize the model on the train dataset, using val to evaluate performance per epoch
    - Evaluate performance on test
    - If performance is good enough submit that to kaggle and then see what the resulting output is
    - I should be able to track the performance of a given set of hyperparameters or design decisions through the whole process. I.e. I'll get a train, val, test performance numbers, then I'll retrain it on the whole dataset using that approach, then I'll submit that to kaggle and evaluate the leaderboard performance for that submission and add it to the experiment tracker.
2. Ok, that's all great. Let's see if I can structure my code in such a way that I can move the sort of repeatable, reuseable part of my code to one portion, and then have the custom code for reading in and preprocessing the data in a different place. That way I could theoretically swap out the preprocessing code and keep the model trainin code if I wanted.


In [41]:
import polars as pl
from omegaconf import OmegaConf
from pathlib import Path
import os

import torch

In [2]:
cfg = OmegaConf.create({
    
})

In [4]:
train_path = '../data/train.csv'
test_path = '../data/test.csv'
sample_submission_path = '../data/sample_submission.csv'

In [None]:
df_train = pl.read_csv(train_path)
df_test = pl.read_csv(test_path)

In [11]:
df_train.sample(5)

id,keyword,location,text,target
i64,str,str,str,i64
6310,"""hostage""",,"""Related News: 'ISIS video' thr…",1
10020,"""twister""",,"""Brain twister let drop up tell…",0
617,"""arsonist""","""ss""","""@58hif my trick is to think ab…",0
7816,"""quarantine""",,"""Reddit Will Now Quarantine Off…",1
7578,"""outbreak""","""The Netherlands""","""Families to sue over Legionnai…",1


In [40]:
df_test.sample(5)

id,keyword,location,text
i64,str,str,str
6489,"""injuries""",,"""4 Common Running Injuries and …"
901,"""bioterrorism""","""The Forbidden Forest""","""@BishoyRagheb fair. Bioterrori…"
8984,"""storm""","""North Coast of O-H-I-O""","""Storm rolling into Hilton Head…"
5053,"""eyewitness""","""West Virginia""","""UPDATE: A GOP-controlled Senat…"
5250,"""fatality""","""Planet Earth (mainly) #Neuland""",""". @paulrogers002 Many #cancers…"


In [10]:
df_train.null_count()

id,keyword,location,text,target
u32,u32,u32,u32,u32
0,61,2533,0,0


In [9]:
df_train?

[0;31mType:[0m        DataFrame
[0;31mString form:[0m
shape: (7_613, 5)
           ┌───────┬─────────┬──────────┬─────────────────────────────────┬────────┐
           │ id  <...> mes Razed b… ┆ 1      │
           └───────┴─────────┴──────────┴─────────────────────────────────┴────────┘
[0;31mLength:[0m      7613
[0;31mFile:[0m        ~/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/site-packages/polars/dataframe/frame.py
[0;31mDocstring:[0m  
Two-dimensional data structure representing data as a table with rows and columns.

Parameters
----------
data : dict, Sequence, ndarray, Series, or pandas.DataFrame
    Two-dimensional data in various forms; dict input must contain Sequences,
    Generators, or a `range`. Sequence may contain Series or other Sequences.
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
    The schema of the resulting DataFrame. The schema may be declared in several
    ways:

    * As a dict of {name:type} pairs; if t

## Set up the dataloader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

import polars as pl
import torch

In [52]:
torch.manual_seed(42)

<torch._C.Generator at 0x120ddf450>

In [None]:
class DisasterTweetsDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): Path to the CSV file with annotations.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.data = pl.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        # Adjust column names if different from "text" and "target"
        tweet = self.data['text'][idx]
        label = self.data['target'][idx]

        sample = {'tweet': tweet, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [46]:
dataset = DisasterTweetsDataset(csv_file=train_path)
print(dataset[0])

{'tweet': 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'label': 1}


In [49]:
# Define train-test split ratio (e.g., 80% train, 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Split the dataset into training and testing subsets
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders for the training and testing datasets
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [50]:
# Example iteration through the train loader
for batch in train_loader:
    print(batch)
    break

{'tweet': ["US wont upgrade its infrastructure? http://t.co/NGEHhG9YGa' it a bad situation and its going to get ugly very quickly #USA #sustainability", 'watching it go up in flames', 'Gunmen kill four in El Salvador bus attack: Suspected Salvadoran gang members killed four people and wounded s... http://t.co/CNtwB6ScZj', "Too dangerous for them. But it's OK for the rest of us to be in danger. https://t.co/YL67DKf4tb", 'Expect gusty winds heavy downpours and lightning moving northeast toward VA now. http://t.co/Z5cfrWado6', 'eggs desolate', 'The once desolate valley was transformed into a thriving hub of hi\x89ÛÓtech business.', 'To All The Meat-Loving Feminists Of The World Riot Grill Has Arrived: Pop quiz! Which do you prefer: feminist... http://t.co/HXOX7o42Rq', 'Amazon Prime Day: 12 quick takeaways from Amazon\x89Ûªs magnificent train wreck - http://t.co/DBDwtOcGXF', '70 Years After Atomic Bombs Japan Still Struggles With War Past: The anniversary of the devastation wrought b... ht

In [53]:
import torch
import torch.nn as nn

class AvgEmbeddingClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(AvgEmbeddingClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
    
    def forward(self, x):
        # x: (batch_size, seq_len)
        embeds = self.embedding(x)          # (batch_size, seq_len, embed_dim)
        pooled = embeds.mean(dim=1)         # (batch_size, embed_dim)
        output = self.fc(pooled)            # (batch_size, num_classes)
        return output

In [None]:
import torch.optim as optim


In [57]:
# Hyperparameters
vocab_size = 5000      # Size of your vocabulary
embed_dim = 128        # Embedding dimension
num_classes = 2        # Number of output classes
seq_len = 50           # Length of each text sequence
batch_size = 32
num_epochs = 5
learning_rate = 0.001

model = AvgEmbeddingClassifier(vocab_size, embed_dim, num_classes)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [60]:
# Simple tokenizer: lowercase and split by whitespace
def tokenize(text):
    return text.lower().split()

In [59]:
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()           # Zero the gradients
        outputs = model(inputs)         # Forward pass
        loss = loss_function(outputs, labels)  # Compute loss
        loss.backward()                 # Backpropagation
        optimizer.step()                # Update parameters
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

print("Training complete!")    

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str

In [61]:
from datasets import load_dataset

In [62]:
load_dataset?

[0;31mSignature:[0m
[0mload_dataset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_dir[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_files[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplit[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mdatasets[0m[0;34m.[0m[0msplits[0m[0;34m

# Huggingface implementation
1. Read in the data into a dataloader or huggingface dataset or whatever
2. tokenize the input data
3. pass the tokenized data to the model
4. 

In [86]:
# %% [code]
import os
import json
import random
import numpy as np
import torch
import torch.nn as nn

from transformers import (
    Trainer,
    TrainingArguments,
    PreTrainedModel,
    PretrainedConfig,
)
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score
from collections import Counter

# ===========
# Parameters
# ===========
# Update these file paths as needed.
TRAIN_FILE = "../data/train.csv"           # CSV with columns "text" and "label"
VAL_FILE = None # "../data/test.csv"        # Optional; if not available, a split will be made

# Tokenizer / data parameters
TEXT_COLUMN = "text"
LABEL_COLUMN = "target"
MAX_LENGTH = 128     # Maximum number of tokens per example

# Model hyperparameters
EMBEDDING_DIM = 128
NUM_LABELS = 2       # Change if you have more classes

# Training hyperparameters
NUM_EPOCHS = 3
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 5e-4
OUTPUT_DIR = "./results"

# ==========================
# Build a simple vocabulary
# ==========================
def build_vocab(texts, min_freq: int = 1):
    """
    Build a vocabulary dictionary from a list of texts.
    Special tokens:
      - "[PAD]": used for padding (id: 0)
      - "[UNK]": used for unknown tokens (id: 1)
    All tokens with frequency >= min_freq are added.
    """
    counter = Counter()
    for text in texts:
        tokens = text.split()
        counter.update(tokens)
    # Initialize with special tokens.
    vocab = {"[PAD]": 0, "[UNK]": 1}
    for token, freq in counter.items():
        if freq >= min_freq:
            vocab[token] = len(vocab)
    return vocab

# ======================
# Tokenization function
# ======================
def tokenize_function(example):
    """
    A very simple tokenizer that splits on whitespace and then maps tokens
    to their corresponding IDs using the global `vocab` dictionary.
    It also pads/truncates the sequence to MAX_LENGTH and creates an attention mask.
    """
    tokens = example[TEXT_COLUMN].split()
    # Convert tokens to ids (using [UNK] for tokens not in vocab)
    token_ids = [vocab.get(token, vocab["[UNK]"]) for token in tokens]
    # Truncate if necessary
    if len(token_ids) > MAX_LENGTH:
        token_ids = token_ids[:MAX_LENGTH]
        attention_mask = [1] * MAX_LENGTH
    else:
        attention_mask = [1] * len(token_ids) + [0] * (MAX_LENGTH - len(token_ids))
        token_ids = token_ids + [vocab["[PAD]"]] * (MAX_LENGTH - len(token_ids))
    return {"input_ids": token_ids, "attention_mask": attention_mask}

# ==========================================
# Define a simple CBOW classification model
# ==========================================
class CBOWConfig(PretrainedConfig):
    model_type = "cbow"
    def __init__(self, vocab_size=30522, hidden_size=128, num_labels=2, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_labels = num_labels

class CBOWForSequenceClassification(PreTrainedModel):
    config_class = CBOWConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """
        Forward pass:
          - Embed the input tokens.
          - Compute the average over the sequence.
          - Apply dropout and classify.
        """
        embedded = self.embedding(input_ids)  # [batch_size, seq_length, hidden_size]
        pooled = embedded.mean(dim=1)         # [batch_size, hidden_size]
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)        # [batch_size, num_labels]

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# ======================
# Compute evaluation metrics
# ======================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# ============
# Main function
# ============
def main():
    # For reproducibility
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    # ------------------------------
    # 1. Load the dataset
    # ------------------------------
    if False: # os.path.exists(VAL_FILE):
        # If separate train and validation files exist, load them.
        data_files = {"train": TRAIN_FILE, "validation": VAL_FILE}
        dataset = load_dataset("csv", data_files=data_files)
    else:
        # Otherwise, load the training CSV and perform a random split (e.g., 90/10 split).
        raw_dataset = load_dataset("csv", data_files=TRAIN_FILE)
        split_dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)
        dataset = DatasetDict({"train": split_dataset["train"], "validation": split_dataset["test"]})

    # ------------------------------
    # 2. Build the tokenizer vocabulary from the training set.
    # ------------------------------
    print("Building vocabulary from training data...")
    texts = dataset["train"][TEXT_COLUMN]
    global vocab  # Make vocab global so that tokenize_function() can access it.
    vocab = build_vocab(texts, min_freq=1)
    print(f"Vocabulary size: {len(vocab)}")

    # Optionally, save the vocabulary for later use.
    with open("vocab.json", "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)

    # ------------------------------
    # 3. Tokenize the datasets.
    # ------------------------------
    print("Tokenizing dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=False)
    tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", LABEL_COLUMN])

    # ------------------------------
    # 4. Initialize the model from scratch.
    # ------------------------------
    config = CBOWConfig(
        vocab_size=len(vocab),
        hidden_size=EMBEDDING_DIM,
        num_labels=NUM_LABELS,
    )
    model = CBOWForSequenceClassification(config)

    # ------------------------------
    # 5. Setup TrainingArguments and Trainer.
    # ------------------------------
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=100,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        logging_dir='./logs',
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        compute_metrics=compute_metrics,
    )

    # ------------------------------
    # 6. Train the model!
    # ------------------------------
    trainer.train()

    # Save the model and vocabulary.
    trainer.save_model(OUTPUT_DIR)
    print("Training complete. Model and vocabulary saved.")

# Call main() to run the training process.
main()

Building vocabulary from training data...
Vocabulary size: 29596
Tokenizing dataset...


Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]



ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

# Old/Misc

## Getting started tutorial they had

In [15]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [16]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [17]:
train_df[train_df["target"] == 0]["text"].values[1]


'I love fruits'

In [18]:
train_df[train_df["target"] == 1]["text"].values[1]


'Forest fire near La Ronge Sask. Canada'

In [19]:
count_vectorizer = feature_extraction.text.CountVectorizer()

example_train_vectors = count_vectorizer.fit_transform(train_df['text'][0:5])

In [20]:
print(example_train_vectors[0].todense().shape)

(1, 54)


In [21]:
type(example_train_vectors)

scipy.sparse._csr.csr_matrix

In [22]:
train_vectors = count_vectorizer.fit_transform(train_df['text'])
test_vectors = count_vectorizer.transform(test_df['text'])

In [23]:
clf = linear_model.RidgeClassifier()

In [24]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df['target'], cv=3, scoring='f1')

In [25]:
scores

array([0.59453669, 0.5642787 , 0.64082434])

In [26]:
clf.fit(train_vectors, train_df['target'])

In [27]:
results = clf.predict(test_vectors)

In [29]:
sample_submission = pd.read_csv(sample_submission_path)

In [31]:
sample_submission['target'] = results

In [32]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [34]:
sample_submission.to_csv('../data/submissions/test_submission.csv', index=False)

In [35]:
train_vectors.shape

(7613, 21637)

In [36]:
test_vectors.shape

(3263, 21637)