# Task 4
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the jupyter notebook version of the template. For the python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [1]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, DistilBertTokenizer


  from .autonotebook import tqdm as notebook_tqdm


Depending on your approach, you might need to adapt the structure of this template or parts not marked by TODOs.
It is not necessary to completely follow this template. Feel free to add more code and delete any parts that are not required.

In [None]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 16
NUM_EPOCHS = 5
bert_model = AutoModel.from_pretrained("distilbert-base-uncased")
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

train_val = pd.read_csv("train.csv")
test_val = pd.read_csv("test_no_score.csv")


#rename columns from sentence to text and from score to label
train_val.rename(columns={"sentence": "text", "score": "label"}, inplace=True)
test_val.rename(columns={"sentence": "text"}, inplace=True)

print(train_val["label"].value_counts())

label
0    6404
1    6096
Name: count, dtype: int64


In [5]:
# TODO: Fill out ReviewDataset
class ReviewDataset(Dataset):
    def __init__(self, data_frame):
        self.data_frame = data_frame
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.texts = data_frame["text"].tolist()
        self.labels = data_frame["label"].tolist() if "label" in data_frame.columns else None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        inputs = self.tokenizer(text, padding="max_length", truncation=True, max_length=180, return_tensors="pt")
        item = {key: val.squeeze(0) for key, val in inputs.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[index], dtype=torch.float)
        return item

In [6]:
train_dataset = ReviewDataset(train_val)
test_dataset = ReviewDataset(test_val)

print(train_dataset[0])  # Example to check the dataset structure

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)
# Additional code if needed

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

{'input_ids': tensor([  101,  2023,  2028,  2003,  1037,  2978,  4326,  1012,  2006,  2028,
         2192,  1045,  2228,  2009,  2003,  1037,  3376,  1998,  8235,  1010,
        13459,  2201,  1012,  2006,  1996,  2060,  2192,  1045,  2228,  2008,
         2002,  2439,  1037,  2978,  1997,  2010,  2373,  1012,  2054,  1045,
         3335,  2182,  2024,  2047,  2774,  2044,  2061,  2116,  2086,  2302,
         2725,  2009,  3948,  1010,  1998,  1996, 10424,  5657,  2543,  2002,
         2109,  2000,  2031,  1010,  2009,  2003,  2036,  6517,  2000,  5060,
         2008,  2010,  2376,  1010,  1996,  2376,  1997,  2600,  1010,  2003,
         2025,  2054,  2009,  2109,  2000,  2022,  1012,  2021,  1010,  2064,
         1005,  1056,  3524,  2005,  1996,  2279,  2201,  1010,  2004,  2023,
         2051,  2002,  2404,  1037,  2613,  2100,  2204,  2316,  2362,  1012,
        13718,  4394,  2069,  2003,  2745,  3389,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [7]:
# TODO: Fill out MyModule
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("distilbert-base-uncased")
        for param in self.bert.parameters():
            param.requires_grad = False
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)  # Output a single logit
        )


    def forward(self, x):
        outputs = self.bert(input_ids=x["input_ids"], attention_mask=x["attention_mask"])
        hidden_state = outputs.last_hidden_state[:, 0]  # CLS token
        logits = self.classifier(hidden_state)
        output = logits.squeeze(-1)
        # print(output)
        return torch.sigmoid(output)  # shape: (batch,)


model = MyModule().to(DEVICE)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# TODO: Setup loss function, optimiser, and scheduler
criterion = nn.CrossEntropyLoss()
print(sum(p.numel() for p in model.parameters() if p.requires_grad))
optimiser = torch.optim.AdamW(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimiser, step_size=1, gamma=0.9)

model.train()
for epoch in range(NUM_EPOCHS):
    model.train()
    mean_loss = 0.0
    for batch in tqdm(train_loader, total=len(train_loader)):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(batch)
        labels = batch["labels"].to(DEVICE)
        loss = criterion(outputs, labels)
        mean_loss += loss.item() / len(train_loader)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        scheduler.step()
    print(f"Mean loss in epoch {epoch}: {mean_loss}")





262913


100%|██████████| 782/782 [01:22<00:00,  9.50it/s]


Mean loss in epoch 0: 20.878525592481985


100%|██████████| 782/782 [01:13<00:00, 10.65it/s]


Mean loss in epoch 1: 20.83263197061044


 17%|█▋        | 131/782 [00:12<01:00, 10.74it/s]

In [11]:
model.eval()
with torch.no_grad():
    results = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        logits = model(batch)
        predictions = (logits > 0.5).long()
        results.append(predictions.cpu().numpy())

    with open("result.txt", "w") as f:
        for val in np.concatenate(results):
            f.write(f"{val}\n")

100%|██████████| 42/42 [00:06<00:00,  6.01it/s]
