In [9]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from tqdm.auto import tqdm

import config as cfg
from src.tokenizer import WordTokenizer
from src.dataset import TextClassificationDataset
from src.utils import normalizeString, train, evaluate
from src.model import TextClassificationModel

## Load dataset

### Disaster Tweets

In [10]:
train_df = pd.read_csv("./dataset/disaster-twitter/train.csv")
# test_df = pd.read_csv("./dataset/disaster-twitter/test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
corpus = [normalizeString(text) for text in train_df.text.values]
target = train_df.target.values

X_train, X_val, y_train, y_val = train_test_split(
    corpus, target, test_size=0.2, random_state=42
)

In [12]:
print("Some examples of training data:")
idx = random.randint(0, len(X_train))
print(f"X_train[{idx}]:", X_train[idx])
print(f"y_train[{idx}]:", y_train[idx])

Some examples of training data:
X_train[1496]: falling asleep to the sounds to thousands of river plate fans in the stadium and a thunderstorm vivaargentina
y_train[1496]: 0


### ABCDEF

## Tokenizer

In [13]:
tokenizer = WordTokenizer(cfg.VOCAB_SIZE, cfg.MAX_SEQ_LENGTH)

tokenizer.add_corpus(corpus)

In [14]:
example = "I am a student"
example = normalizeString(example)

print("Example:", example)
ids, mask = tokenizer.encode(example, get_mask=True)
print("Encoded:", ids)
print("Decoded:", tokenizer.decode(ids))

Example: i am a student
Encoded: tensor([  86,  120,   56, 2290,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
Decoded: ['i', 'am', 'a', 'student']


## Model

In [15]:
model = TextClassificationModel(
    vocab_size=len(tokenizer.word2index),
    num_classes=2,
    max_seq_len=cfg.MAX_SEQ_LENGTH,
    embedding_dim=cfg.EMBEDDING_DIM,
    n_heads=cfg.N_HEADS,
    n_layers=cfg.N_LAYERS,
    ff_dim=cfg.FF_DIM,
    drop_out=cfg.DROP_OUT,
)

## Training

In [16]:
train_dataset = TextClassificationDataset(
    X_train, y_train, tokenizer, cfg.MAX_SEQ_LENGTH
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=cfg.BATCH_SIZE, shuffle=True
)

test_dataset = TextClassificationDataset(X_val, y_val, tokenizer, cfg.MAX_SEQ_LENGTH)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=cfg.BATCH_SIZE, shuffle=False
)

optimizer = torch.optim.Adam(model.parameters(), lr=cfg.LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(cfg.NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{cfg.NUM_EPOCHS}")
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    evaluate_loss, evaluate_acc = evaluate(
        model, test_loader, criterion, device, verbose=True
    )

Epoch 1/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 2/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 3/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 4/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 5/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 6/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 7/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 8/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 9/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch 10/10


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

In [17]:
torch.save(model.state_dict(), "./model/model.pt")