In [None]:
!pip install torch

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/d0/5f/f41b14a398d484bf218d5167ec9061c1e76f500d9e25166117818c8bacda/torch-2.3.1-cp311-none-macosx_11_0_arm64.whl.metadata
  Downloading torch-2.3.1-cp311-none-macosx_11_0_arm64.whl.metadata (26 kB)
Downloading torch-2.3.1-cp311-none-macosx_11_0_arm64.whl (61.0 MB)
[2K   [38;2;249;38;114m━━━━━━━━━━━[0m[38;5;237m╺[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/61.0 MB[0m [31m875.2 kB/s[0m eta [36m0:00:50[0m:51[0m

In [2]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
import torch.nn.functional as F

seed = 1234
device = "cuda"
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

  from pandas.core import (


ModuleNotFoundError: No module named 'torch'

In [None]:
dataset = "mr"

if dataset == "mr":
  train_data, valid_data, test_data = datasets.load_dataset("jeffnyman/rotten_tomatoes_reviews", split=["train", "validation", "test"])

if dataset == "subj":
  train_data, test_data = datasets.load_dataset("SetFit/subj", split=["train", "test"])
  train_valid_data = train_data.train_test_split(test_size=0.1)
  train_data = train_valid_data["train"]
  valid_data = train_valid_data["test"]

tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example["text"])[:max_length]
    return {"tokens": tokens}

max_length = 256

train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
valid_data = valid_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

In [None]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"]

vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]
vocab.set_default_index(unk_index)

def numericalize_example(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}

train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

train_data = train_data.with_format(type="torch", columns=["ids", "label"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label"])
test_data = test_data.with_format(type="torch", columns=["ids", "label"])

In [3]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i["ids"] for i in batch]
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch_label = [i["label"] for i in batch]
        batch_label = torch.stack(batch_label)
        batch = {"ids": batch_ids, "label": batch_label}
        return batch

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
  collate_fn = get_collate_fn(pad_index)
  data_loader = torch.utils.data.DataLoader(
      dataset=dataset,
      batch_size=batch_size,
      collate_fn=collate_fn,
      shuffle=shuffle,
  )
  return data_loader


In [None]:
class LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim, label_size)

    def forward(self, x):
        # batch_size = x.shape[0]
        embedded = self.embedding(x)
        output, (ht, _) = self.lstm(embedded)
        return self.fc(ht[-1])


In [None]:
vocab_size = len(vocab)
embedding_dim = 300
n_filters = 100
filter_sizes = [3, 4, 5]
batch_size = 50
LR = 1

output_dim = len(train_data.unique("label"))

model = LSTM(vocab_size, embedding_dim, 128, 4).to(device)


# model = CNN_Text(vocab_size, embedding_dim)

def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv1d):
        nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
        nn.init.zeros_(m.bias)

model.apply(initialize_weights)

optimizer = torch.optim.Adadelta(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

criterion = nn.CrossEntropyLoss()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

model = model.to(device)
criterion = criterion.to(device)

The model has 1,961,700 trainable parameters


In [None]:
def train(data_loader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm.tqdm(data_loader, desc="training..."):
        ids = batch["ids"].to(device)
        label = batch["label"].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())

    return np.mean(epoch_losses), np.mean(epoch_accs)

def evaluate(data_loader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            label = batch["label"].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [None]:
n_epochs = 50
best_valid_acc = 0

metrics = collections.defaultdict(list)

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

for epoch in range(n_epochs):
    train_loss, train_acc = train(
        train_data_loader, model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate(valid_data_loader, model, criterion, device)
    scheduler.step()
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "cnn.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

training...: 100%|██████████| 171/171 [00:00<00:00, 193.37it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 295.45it/s]


epoch: 0
train_loss: 0.716, train_acc: 0.533
valid_loss: 0.671, valid_acc: 0.596


training...: 100%|██████████| 171/171 [00:00<00:00, 195.17it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 300.07it/s]


epoch: 1
train_loss: 0.655, train_acc: 0.614
valid_loss: 0.647, valid_acc: 0.621


training...: 100%|██████████| 171/171 [00:00<00:00, 190.18it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 296.03it/s]


epoch: 2
train_loss: 0.583, train_acc: 0.691
valid_loss: 0.627, valid_acc: 0.648


training...: 100%|██████████| 171/171 [00:00<00:00, 189.99it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 298.65it/s]


epoch: 3
train_loss: 0.502, train_acc: 0.760
valid_loss: 0.641, valid_acc: 0.671


training...: 100%|██████████| 171/171 [00:00<00:00, 190.26it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.23it/s]


epoch: 4
train_loss: 0.409, train_acc: 0.816
valid_loss: 0.621, valid_acc: 0.699


training...: 100%|██████████| 171/171 [00:00<00:00, 195.65it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.71it/s]


epoch: 5
train_loss: 0.311, train_acc: 0.867
valid_loss: 0.637, valid_acc: 0.708


training...: 100%|██████████| 171/171 [00:00<00:00, 194.32it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 298.63it/s]


epoch: 6
train_loss: 0.219, train_acc: 0.910
valid_loss: 0.820, valid_acc: 0.690


training...: 100%|██████████| 171/171 [00:00<00:00, 195.08it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.10it/s]


epoch: 7
train_loss: 0.128, train_acc: 0.951
valid_loss: 1.035, valid_acc: 0.694


training...: 100%|██████████| 171/171 [00:00<00:00, 196.08it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 300.52it/s]


epoch: 8
train_loss: 0.071, train_acc: 0.974
valid_loss: 1.410, valid_acc: 0.676


training...: 100%|██████████| 171/171 [00:00<00:00, 194.15it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.73it/s]


epoch: 9
train_loss: 0.034, train_acc: 0.989
valid_loss: 1.448, valid_acc: 0.688


training...: 100%|██████████| 171/171 [00:00<00:00, 190.57it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 271.45it/s]


epoch: 10
train_loss: 0.015, train_acc: 0.995
valid_loss: 1.693, valid_acc: 0.707


training...: 100%|██████████| 171/171 [00:00<00:00, 193.49it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 291.64it/s]


epoch: 11
train_loss: 0.007, train_acc: 0.998
valid_loss: 2.029, valid_acc: 0.687


training...: 100%|██████████| 171/171 [00:00<00:00, 190.36it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.11it/s]


epoch: 12
train_loss: 0.004, train_acc: 0.999
valid_loss: 1.954, valid_acc: 0.694


training...: 100%|██████████| 171/171 [00:00<00:00, 188.66it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.28it/s]


epoch: 13
train_loss: 0.003, train_acc: 0.999
valid_loss: 2.026, valid_acc: 0.692


training...: 100%|██████████| 171/171 [00:00<00:00, 191.74it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 294.65it/s]


epoch: 14
train_loss: 0.002, train_acc: 0.999
valid_loss: 2.066, valid_acc: 0.690


training...: 100%|██████████| 171/171 [00:00<00:00, 194.08it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.55it/s]


epoch: 15
train_loss: 0.002, train_acc: 1.000
valid_loss: 2.118, valid_acc: 0.686


training...: 100%|██████████| 171/171 [00:00<00:00, 196.56it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 292.16it/s]


epoch: 16
train_loss: 0.002, train_acc: 0.999
valid_loss: 2.122, valid_acc: 0.690


training...: 100%|██████████| 171/171 [00:00<00:00, 194.52it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.03it/s]


epoch: 17
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.199, valid_acc: 0.688


training...: 100%|██████████| 171/171 [00:00<00:00, 196.72it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.02it/s]


epoch: 18
train_loss: 0.002, train_acc: 0.999
valid_loss: 2.180, valid_acc: 0.688


training...: 100%|██████████| 171/171 [00:00<00:00, 197.83it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.28it/s]


epoch: 19
train_loss: 0.002, train_acc: 1.000
valid_loss: 2.192, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 197.21it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 304.57it/s]


epoch: 20
train_loss: 0.001, train_acc: 0.999
valid_loss: 2.201, valid_acc: 0.698


training...: 100%|██████████| 171/171 [00:00<00:00, 195.48it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.30it/s]


epoch: 21
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.219, valid_acc: 0.692


training...: 100%|██████████| 171/171 [00:00<00:00, 195.85it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 296.17it/s]


epoch: 22
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.222, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 194.76it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.27it/s]


epoch: 23
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.230, valid_acc: 0.696


training...: 100%|██████████| 171/171 [00:00<00:00, 195.39it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.83it/s]


epoch: 24
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.249, valid_acc: 0.692


training...: 100%|██████████| 171/171 [00:00<00:00, 190.85it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.01it/s]


epoch: 25
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.253, valid_acc: 0.697


training...: 100%|██████████| 171/171 [00:00<00:00, 191.52it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.79it/s]


epoch: 26
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.266, valid_acc: 0.691


training...: 100%|██████████| 171/171 [00:00<00:00, 194.92it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.42it/s]


epoch: 27
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.268, valid_acc: 0.691


training...: 100%|██████████| 171/171 [00:00<00:00, 195.38it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.97it/s]


epoch: 28
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.272, valid_acc: 0.695


training...: 100%|██████████| 171/171 [00:00<00:00, 195.82it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.03it/s]


epoch: 29
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.280, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 196.19it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 300.65it/s]


epoch: 30
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.285, valid_acc: 0.695


training...: 100%|██████████| 171/171 [00:00<00:00, 196.16it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 300.76it/s]


epoch: 31
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.290, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 194.63it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 300.98it/s]


epoch: 32
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.294, valid_acc: 0.694


training...: 100%|██████████| 171/171 [00:00<00:00, 196.49it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.13it/s]


epoch: 33
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.298, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 196.32it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.51it/s]


epoch: 34
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.302, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 195.24it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 302.99it/s]


epoch: 35
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.305, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 196.68it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.02it/s]


epoch: 36
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.308, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 195.62it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 291.57it/s]


epoch: 37
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.310, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 192.94it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.64it/s]


epoch: 38
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.311, valid_acc: 0.692


training...: 100%|██████████| 171/171 [00:00<00:00, 194.30it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 298.84it/s]


epoch: 39
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.314, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 194.28it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.99it/s]


epoch: 40
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.315, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 195.47it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 298.25it/s]


epoch: 41
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.317, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 195.24it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 293.70it/s]


epoch: 42
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.319, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 197.64it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 299.45it/s]


epoch: 43
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.320, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 196.40it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 295.01it/s]


epoch: 44
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.321, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 194.05it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 297.08it/s]


epoch: 45
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.322, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 196.61it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 302.02it/s]


epoch: 46
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.323, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 194.20it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.34it/s]


epoch: 47
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.324, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 196.61it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 292.80it/s]


epoch: 48
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.325, valid_acc: 0.693


training...: 100%|██████████| 171/171 [00:00<00:00, 193.28it/s]
evaluating...: 100%|██████████| 22/22 [00:00<00:00, 301.86it/s]

epoch: 49
train_loss: 0.001, train_acc: 1.000
valid_loss: 2.325, valid_acc: 0.693





In [None]:
model.load_state_dict(torch.load("cnn.pt"))

test_loss, test_acc = evaluate(test_data_loader, model, criterion, device)

print(f"test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}")

evaluating...: 100%|██████████| 22/22 [00:00<00:00, 302.54it/s]

test_loss: 1.932, test_acc: 0.715





We can observe that the LSTM model has overfitted. By looking at the train_acc and val_acc, we can confirm this. As valid_acc approaches 70.8%, train_acc approached 100%, which is a clear sign of overfitting. Hence, we need to solve this issue of overfitting.

In order to solve thie issue of overfitting, one of the most effective ways is to add dropout layers. By adding dropout layers, we can reduce overfittng, hence increasing accuracy.

In [None]:
class LSTM_dropout(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, label_size)

    def forward(self, x):
        # batch_size = x.shape[0]
        embedded = self.embedding(x)
        output, (ht, _) = self.lstm(embedded)
        print(ht)
        return self.fc(ht[-1])


Next, we move on to explore another proposed idea - using a pre trained transformer LLM.

In [None]:
import transformers
import torch

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline
from tqdm import tqdm
corr, wrong = 0, 0
for i in tqdm(range(0, len(valid_data), 3)):
  messages = [
      {"role": "system", "content": "You are a helper that checks if a message is positive or negative."},
      {"role": "user", "content": valid_data['text'][i]},
  ]

  pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
  )

  generation_args = {
      "max_new_tokens": 1,
      "return_full_text": False,
      "temperature": 0.0,
      "do_sample": False,
  }

  output = pipe(messages, **generation_args)

  if str(output[0]['generated_text']) in str(valid_data['label'][i]):
    corr += 1
  else:
    wrong += 1

100%|██████████| 356/356 [07:07<00:00,  1.20s/it]


In [None]:
print(corr, wrong)
print(f"Accuracy: {corr/(corr+wrong)}")

323 33
Accuracy: 0.9073033707865169


We can see that by limiting the number of token output to 1, and setting the temperature to 0, we made the model more deterministic.


By looking at the accuracy given by the model, we can observe that it is the best model by far. With a shocking **90.73%** accuracy given by the model, this is leagues above any other model we have implemented so far. It is no surprise, since it has 3.82 billion parameters.

However, the huge size computational power required for the model may make it impractical for trivial tasks for companies that do not have their own gpu(s). We can attempt to fine tune the LLM model to further improve accuracy.

However, due to the severe lack of computational power, we will not be fine tuning our own model today.

Another interesting technique we can try out is to combine the CNN model with another RNN layer. We can do this by replacing the linear layer with LSTM layers instead.

Proposed architecture:

In [None]:
class CNN_LSTM_Text(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CNN_LSTM_Text, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.conv1 = nn.Conv2d(1, 100, (3, embedding_dim), padding=0) # original was 1, 100
        self.conv2 = nn.Conv2d(1, 100, (4, embedding_dim), padding=0)
        self.conv3 = nn.Conv2d(1, 100, (5, embedding_dim), padding=0)

        self.lstm = nn.LSTM(100, 100, num_layers=1, batch_first=True)

        self.dropout = nn.Dropout(0.5)

        self.fc = nn.Linear(100, 4)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        output_conv_1 = F.relu(self.conv1(embedded))
        output_conv_1 = output_conv_1.squeeze(3)
        output_conv_2 = F.relu(self.conv2(embedded)).squeeze(3)
        output_conv_3 = F.relu(self.conv3(embedded)).squeeze(3)
        output_maxpool_1 = F.max_pool1d(output_conv_1, output_conv_1.size(2)).squeeze(2).unsqueeze(1)
        output_maxpool_2 = F.max_pool1d(output_conv_2, output_conv_2.size(2)).squeeze(2).unsqueeze(1)
        output_maxpool_3 = F.max_pool1d(output_conv_3, output_conv_3.size(2)).squeeze(2).unsqueeze(1)
        output_maxpooled = torch.cat(
            (output_maxpool_1, output_maxpool_2, output_maxpool_3), dim=1
        )
        _, (h_n, _) = self.lstm(output_maxpooled)
        lstm_output = h_n[-1]
        drop_outed = self.dropout(lstm_output)
        final = self.fc(drop_outed)
        return F.softmax(final)
