In [1]:
############################
# GPU and CPU Check Code
# KEEP AT THE TOP
############################

# !pip install psutil
# !pip install gputil

import psutil
import torch
import os
import spacy

from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import psutil

# Get the number of CPUs
num_cpus = psutil.cpu_count(logical=False)  # physical cores
num_logical_cpus = psutil.cpu_count(logical=True)  # logical cores

print(f"Number of physical CPUs: {num_cpus}")
print(f"Number of logical CPUs: {num_logical_cpus}")

try:
    import GPUtil

    # Get the number of available GPUs
    gpus = GPUtil.getGPUs()
    num_gpus = len(gpus)

    print(f"Number of GPUs: {num_gpus}")

    for i, gpu in enumerate(gpus):
        print(f"GPU {i + 1}: {gpu.name}")
        print(f"\tMemory Total: {gpu.memoryTotal} MB")
        print(f"\tMemory Used: {gpu.memoryUsed} MB")
        print(f"\tMemory Free: {gpu.memoryFree} MB")
        print(f"\tGPU Utilization: {gpu.load * 100}%")
        print(f"\tGPU Temperature: {gpu.temperature} °C")
except ImportError:
    print("GPUtil library not found. Cannot check GPU information.")


Number of physical CPUs: 128
Number of logical CPUs: 128
Number of GPUs: 2
GPU 1: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 31.0 °C
GPU 2: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 27.0 °C


In [2]:
# # USE ONLY TO EXTRACT FILES FROM TAR FILES

# import tarfile

# def extract_all_files(tar_file_path, extract_to):
#     with tarfile.open(tar_file_path, 'r') as tar:
#         tar.extractall(extract_to)

# # Example usage
# tar_file_path = 'datasets/qrels.trec8.qa.gz'
# extract_to = 'datasets/'
# extract_all_files(tar_file_path, extract_to)

In [3]:
import re
import os
import torch
from torch import nn
from tqdm.auto import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Regex to extract category, subcategory, and question
regex = r"([\w]+):([\w]+) (.*)"

def read_questions(filepath):
    questions = []
    categories = set()
    with open(filepath, 'r') as f:
        for line in f:
            match = re.search(regex, line)
            if match:
                category = match.group(1)
                subcategory = match.group(2)
                question = match.group(3)
                questions.append((category, subcategory, question))
                categories.add(category)
    return questions, list(categories)

def preprocess(questions, tokenizer):
    input_ids = []
    attention_masks = []
    for cat, subcat, q in questions:
        encoded = tokenizer(q, truncation=True, padding='max_length')
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return input_ids, attention_masks

# Paths
dataset_dir = "./datasets"
filename = "TREC_test.txt"
filepath = os.path.join(dataset_dir, filename)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Parse and tokenize
questions, categories = read_questions(filepath)
input_ids, attention_masks = preprocess(questions, tokenizer)

# Assign labels dynamically to categories
label_map = {cat: label for label, cat in enumerate(categories)}

# Prepare labels based on the assigned labels for categories
labels = [label_map[cat] for cat, _, _ in questions]

In [6]:
import csv
import re
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.optim.lr_scheduler import LambdaLR

print("DONE IMPORTING")

DONE IMPORTING


In [7]:
# Load class labels
with open('datasets/yahoo_answers_csv/classes.txt') as f:
    categories = [line.strip() for line in f] 

# Load dataset (train and validation)
texts = [] 
labels = []
with open('datasets/yahoo_answers_csv/train.csv') as f:
    reader = csv.reader(f)
    next(reader)  # Skip header
    for row in reader:
        label = int(row[0]) - 1  # Class index starts from 1
        text = f"{row[1]} {row[2]}"  # Use f-strings for concatenation
        text = re.sub(r'\\"', '"', text)  # Unescape quotes
        text = re.sub(r'\\n', '', text)
        texts.append(text)
        labels.append(label)

# Tokenize        
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')   
encoded_texts = tokenizer(texts, truncation=True, padding='max_length', return_tensors='pt')

input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']
labels = torch.tensor(labels)

print(f"{len(input_ids)} examples tokenized")

1399999 examples tokenized


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Splitting into train and validation
train_size = int(0.9 * len(input_ids))
val_size = len(input_ids) - train_size

train_dataset = TensorDataset(input_ids[:train_size], attention_masks[:train_size], labels[:train_size])
val_dataset = TensorDataset(input_ids[train_size:], attention_masks[train_size:], labels[train_size:])

batch_size = 32
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=RandomSampler(val_dataset),
    batch_size=batch_size
)

# Model architecture
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(categories)
)

model.to(device)

# Optimizer with custom hyperparameters for Adam
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
    betas=(0.9, 0.999)  # Set beta1 and beta2 here
)

# Define the slanted triangular learning rate scheduler
total_steps = len(train_dataloader) * 4  # Total number of training steps for 4 epochs
warmup_steps = int(0.1 * total_steps)  # 10% of total steps for warmup

def lr_lambda(current_step):
    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))
    else:
        return max(
            0.0,
            float(total_steps - current_step) / float(max(1, total_steps - warmup_steps))
        )

scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

# Training loop with learning rate scheduler and custom Adam optimizer
best_val_loss = float('inf')
for epoch in range(1, 5):  # Training for maximum 4 epochs
    model.train()
    loss_train = 0

    for step, batch in enumerate(train_dataloader):
        input_batch = {k: v.to(device) for k, v in zip(['input_ids', 'attention_mask', 'labels'], batch)}  # Move batch to device
        outputs = model(**input_batch)
        logits = outputs.logits

        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, input_batch['labels'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        scheduler.step()  # Update learning rate

        loss_train += loss.item()

    print(f'Epoch {epoch} - Training loss: {loss_train / len(train_dataloader)}')

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in val_dataloader:
            input_batch = {k: v.to(device) for k, v in zip(['input_ids', 'attention_mask', 'labels'], val_batch)}
            outputs = model(**input_batch)
            logits = outputs.logits
            loss = loss_fct(logits, input_batch['labels'])
            val_loss += loss.item()

    val_loss /= len(val_dataloader)
    print(f'Epoch {epoch} - Validation loss: {val_loss}')

    # Save the best model based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')  # Saving the model

Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

KeyboardInterrupt: 