#Natural language processing with deep learninh techniques, Project - A

## Imports

In [1]:
import numpy as np
import pandas as pd
import torch
import zipfile
from nltk.tokenize import TweetTokenizer
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

## Data Processing

In [2]:
def create_df(path):
  with zipfile.ZipFile(path, 'r') as z:
      z.extractall()
      print(z.namelist())

  with zipfile.ZipFile(path, 'r') as z:
      train_df = pd.read_csv(z.open("train.csv"))
      test_df  = pd.read_csv(z.open("validation.csv"))
  return train_df, test_df

In [3]:
# labels are integers, comments are stringa
def process_data(df):
  df_text = df["text"].astype(str).tolist()
  df_labels = df["label"].astype(int).tolist()
  return df_text, df_labels

In [4]:
path = 'data.zip'
train_df, test_df = create_df(path)
train_text, train_labels = process_data(train_df)
test_text, test_labels = process_data(test_df)

['validation.csv', 'train.csv']


### Tokenize

In [5]:
# found this tokenizer for twitter, need to see if we have to work with specific one
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
train_tokens = [tokenizer.tokenize(t) for t in train_text]
test_tokens = [tokenizer.tokenize(t) for t in test_text]

### Emedding

In [6]:
def token_id_mapping(tokens):
  counter = Counter()
  for tokens in train_tokens:
      counter.update(tokens)

  token_id = {"<pad>": 0, "<unk>": 1}

  for word, freq in counter.items():
      token_id[word] = len(token_id)
  token_id_size = len(token_id)
  id_token = {idx: word for word, idx in token_id.items()}

  return token_id, token_id_size, id_token

def tokens_to_ids(tokens, vocab, unk_token="<unk>"):
    unk_idx = vocab[unk_token]
    return [vocab.get(tok, unk_idx) for tok in tokens]

def ids_to_tokens(ids, token_id, id_token, pad_token="<pad>"):
    pad_idx = token_id[pad_token]
    return [id_token[idx] for idx in ids if idx != pad_idx]

In [7]:
def GloVe_embedding(vocab):

  # Glove embedding
  embedding_dim = 200 #should check more
  #glove_path = "glove.twitter.27B.100d.txt" #looks nice from kaggle, I will try the origin first

  #extract original GloVe
  glove_path = "glove.6B.200d.txt" #download the zip file and extract it manually cause it takes forever othewise
  embedding_matrix = np.random.uniform(
      -0.25, 0.25, (len(vocab), embedding_dim)
  ).astype("float32")

  print("Loading GloVe embeddings...")

  with open(glove_path, "r", encoding="utf8") as f:
      for line in f:
          values = line.strip().split()
          word = values[0]
          vector = np.asarray(values[1:], dtype="float32")

          if word in vocab:
              idx = vocab[word]
              embedding_matrix[idx] = vector

  print('Done')
  return embedding_matrix, embedding_dim

In [8]:
train_token_to_id, train_vocab_size, train_id_to_token = token_id_mapping(train_tokens)
test_token_to_id, test_vocab_size, test_id_to_token = token_id_mapping(test_tokens)
Gl_train_embedding_matrix, Gl_train_embedding_dim = GloVe_embedding(train_token_to_id)

Loading GloVe embeddings...
Done


### DataLoader

In [9]:
class LSTM_data_loader(Dataset):
    def __init__(self, tweets, labels):
        self.tweets = tweets
        self.labels = labels

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        seq = torch.tensor(self.tweets[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return seq, label


def collate_fn(batch):
    seqs, labels = zip(*batch)
    seqs = pad_sequence(seqs, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return seqs, labels

In [10]:
train_ids = [tokens_to_ids(t, train_token_to_id) for t in train_tokens]
test_ids = [tokens_to_ids(t, test_token_to_id) for t in test_tokens]

train_dataset = LSTM_data_loader(train_ids, train_labels)
test_dataset = LSTM_data_loader(test_ids, test_labels)

LSTM_train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

LSTM_test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)


## Model

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class LSTMClassifier(nn.Module):
    def __init__(self, token_id_size, embed_dim, hidden_dim, num_classes, embedding_matrix):
        super().__init__()

        self.embedding = nn.Embedding(token_id_size, embed_dim)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        out, (h, c) = self.lstm(x)
        logits = self.fc(h[-1])
        return logits


def compute_accuracy(model, dataloader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for x, y in dataloader:
            logits = model(x)
            preds = torch.argmax(logits, dim=1)

            correct += (preds == y).sum().item()
            total += y.size(0)

    return f'Accuracy: {100 * correct / total}'


### Model Training

In [12]:
def train_LSTM(vocab_size, embed_dim, hidden_dim, embedding_matrix, num_epochs, path):
  model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, 6, embedding_matrix)
  loss_objective = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

  # --- Training Loop ---
  for epoch in range(num_epochs):
    model.train()
    loop = tqdm(LSTM_train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch_x, batch_y in loop:
        model.train()
        optimizer.zero_grad()

        logits = model(batch_x)
        loss = loss_objective(logits, batch_y)
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())

    acc = compute_accuracy(model, LSTM_test_loader)
    print(f"epoch {epoch+1} | loss {loss.item():.4f} | {acc}")
  torch.save(model.state_dict(), path)

In [13]:
def double_check(mapping_func, token_to_id, id_to_token, vocab_size, embedding_matrix, path):

  # Make sure the model is in evaluation mode
  model =LSTMClassifier(vocab_size, 200, 256, 6, embedding_matrix)
  model.load_state_dict(torch.load(path))
  model.eval()

  # Pick 10 examples
  num_examples = 20
  examples_printed = 0
  true, all = 0,0
  with torch.no_grad():
      for x, y in LSTM_test_loader:
          logits = model(x)
          preds = torch.argmax(logits, dim=1)

          for i in range(len(x)):
              # print("Tweet:", x[i])  # if x[i] is token ids, you may need to decode
              # print("True label:", y[i].item())
              # print("Predicted label:", preds[i].item())
              # print("---")
              tweet_ids = x[i].tolist()
              tweet_words = mapping_func(tweet_ids, token_to_id, id_to_token)
              print(" ".join(tweet_words))
              if y[i].item() == preds[i].item():
                  true += 1
              all += 1
              examples_printed += 1
              if examples_printed >= num_examples:
                  break
          if examples_printed >= num_examples:
              break
  print('Accracy is:', 100*true/all)

In [14]:
model_path = 'Gl_lstm_model.pth'
embed_dim, hidden_dim, num_epochs = Gl_train_embedding_dim, 256, 10
train_LSTM(train_vocab_size, embed_dim, hidden_dim, Gl_train_embedding_matrix, num_epochs, model_path)
double_check(ids_to_tokens, test_token_to_id, test_id_to_token, train_vocab_size, Gl_train_embedding_matrix, model_path)

Epoch 1/10: 100%|██████████| 500/500 [01:05<00:00,  7.68it/s, loss=1.49]


epoch 1 | loss 1.4912 | Accuracy: 34.85


Epoch 2/10: 100%|██████████| 500/500 [01:04<00:00,  7.79it/s, loss=1.78]


epoch 2 | loss 1.7754 | Accuracy: 35.15


Epoch 3/10: 100%|██████████| 500/500 [01:06<00:00,  7.55it/s, loss=1.78]


epoch 3 | loss 1.7760 | Accuracy: 35.1


Epoch 4/10: 100%|██████████| 500/500 [01:04<00:00,  7.79it/s, loss=1.55]


epoch 4 | loss 1.5500 | Accuracy: 35.1


Epoch 5/10: 100%|██████████| 500/500 [01:03<00:00,  7.85it/s, loss=1.63]


epoch 5 | loss 1.6264 | Accuracy: 34.6


Epoch 6/10: 100%|██████████| 500/500 [01:02<00:00,  7.98it/s, loss=1.48]


epoch 6 | loss 1.4775 | Accuracy: 44.0


Epoch 7/10: 100%|██████████| 500/500 [01:02<00:00,  7.94it/s, loss=1.11]


epoch 7 | loss 1.1110 | Accuracy: 58.05


Epoch 8/10: 100%|██████████| 500/500 [01:04<00:00,  7.79it/s, loss=0.848]


epoch 8 | loss 0.8481 | Accuracy: 71.2


Epoch 9/10: 100%|██████████| 500/500 [01:07<00:00,  7.45it/s, loss=0.167]


epoch 9 | loss 0.1667 | Accuracy: 86.5


Epoch 10/10: 100%|██████████| 500/500 [01:02<00:00,  7.98it/s, loss=0.45]


epoch 10 | loss 0.4496 | Accuracy: 89.2
im feeling quite sad and sorry for myself but ill snap out of it soon
i feel like i am still looking at a blank canvas blank pieces of paper
i feel like a faithful <unk>
i am just feeling cranky and blue
i can have for a treat or if i am feeling festive
i start to feel more appreciative of what god has done for me
i am feeling more confident that we will be able to take care of this baby
i feel incredibly lucky just to be able to talk to her
i feel less keen about the army every day
i feel dirty and ashamed for saying that
i feel bitchy but not defeated yet
i was dribbling on mums coffee table looking out of the window and feeling very happy
i woke up often got up around am feeling <unk> radiation and groggy
i was feeling sentimental
i walked out of there an hour and fifteen minutes later feeling like i had been beaten with a stick and then placed on the rack and stretched
i never stop feeling thankful as to compare with others i considered mysel

## To Do

1. Implement a GRU model.

2. Experiment with hyperparameters and consider plotting a graph for epochs (to potentially find the "optimal" point).