In [1]:
############################
# GPU and CPU Check Code
# KEEP AT THE TOP
############################

# !pip install psutil
# !pip install gputil

import psutil
import torch
import os
import spacy

from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import psutil

# Query psutil for the physical and logical core counts.
num_cpus = psutil.cpu_count(logical=False)  # physical cores
num_logical_cpus = psutil.cpu_count(logical=True)  # logical cores

print(
    f"Number of physical CPUs: {num_cpus}",
    f"Number of logical CPUs: {num_logical_cpus}",
    sep="\n",
)

try:
    import GPUtil

    # GPUtil returns one object per GPU it can see.
    gpus = GPUtil.getGPUs()
    num_gpus = len(gpus)

    print(f"Number of GPUs: {num_gpus}")

    for i, gpu in enumerate(gpus):
        # Collect the per-GPU report first, then emit it one field per line.
        gpu_report = (
            f"GPU {i + 1}: {gpu.name}",
            f"\tMemory Total: {gpu.memoryTotal} MB",
            f"\tMemory Used: {gpu.memoryUsed} MB",
            f"\tMemory Free: {gpu.memoryFree} MB",
            f"\tGPU Utilization: {gpu.load * 100}%",
            f"\tGPU Temperature: {gpu.temperature} °C",
        )
        print(*gpu_report, sep="\n")
except ImportError:
    # GPUtil is optional; without it only the CPU information is reported.
    print("GPUtil library not found. Cannot check GPU information.")


Number of physical CPUs: 128
Number of logical CPUs: 128
Number of GPUs: 2
GPU 1: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 31.0 °C
GPU 2: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 27.0 °C


In [11]:
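# # USE ONLY TO EXTRACT FILES FROM TAR ARCHIVES (.tar, .tar.gz, ...); a plain .gz file is not a tar archive and makes tarfile.open raise ReadError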

# import tarfile

# def extract_all_files(tar_file_path, extract_to):
#     with tarfile.open(tar_file_path, 'r') as tar:
#         tar.extractall(extract_to)

# # Example usage
# tar_file_path = 'datasets/qrels.trec8.qa.gz'
# extract_to = 'datasets/'
# extract_all_files(tar_file_path, extract_to)

ReadError: file could not be opened successfully:
- method gz: ReadError('invalid header')
- method bz2: ReadError('not a bzip2 file')
- method xz: ReadError('not an lzma file')
- method tar: ReadError('invalid header')

In [2]:
# Function to read texts from files within a folder
def read_texts_from_folder(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Entries are visited in sorted filename order so the returned list is the
    same on every run (``os.listdir`` gives no ordering guarantee).
    Sub-directories and other non-file entries are skipped instead of being
    passed to ``open``.

    Args:
        folder_path: Path of the directory to scan (not recursive).

    Returns:
        list[str]: The whitespace-stripped content of each file.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # Only plain files hold text; directories would make open() fail.
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read().strip())
    return texts

# Function to preprocess texts
def preprocess(texts, tokenizer, max_length=512):
    """Encode each text into a fixed-length list of token ids.

    Every text is passed to ``tokenizer.encode_plus`` with truncation enabled
    and ``padding='max_length'``; only the ``'input_ids'`` entry of each
    result is kept. Progress is shown through ``tqdm``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        max_length: Length forwarded to the tokenizer for truncation/padding.

    Returns:
        list: One ``input_ids`` sequence per input text, in input order.
    """
    return [
        tokenizer.encode_plus(
            document,
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )['input_ids']
        for document in tqdm(texts)
    ]

# Folders holding the positive and negative training reviews under aclImdb
aclImdb_folder = "datasets/aclImdb"
train_pos_path, train_neg_path = (
    os.path.join(aclImdb_folder, 'train', sentiment) for sentiment in ('pos', 'neg')
)

# BERT-base-uncased tokenizer used for both encoding and decoding below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load both sentiment folders and concatenate them (positives first)
train_pos_texts = read_texts_from_folder(train_pos_path)
train_neg_texts = read_texts_from_folder(train_neg_path)
train_texts = [*train_pos_texts, *train_neg_texts]

processed = preprocess(train_texts, tokenizer)

print("EDITING FILE")

# Decode every id sequence back to text (special tokens dropped) and store
# one decoded text per line of the pre-training file.
pretrain_file_path = "pretraining_text.txt"
with open(pretrain_file_path, 'w', encoding='utf-8') as pretrain_file:
    pretrain_file.writelines(
        tokenizer.decode(text_ids, skip_special_tokens=True) + '\n'
        for text_ids in processed
    )

print("DONE EDITING")
        
# Create a dataset for pre-training from the text file (read line by line)
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=pretrain_file_path,
    block_size=512  # Adjust the block size as per your sequence length
)

# Data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Probability of masking tokens
)

# Initialize the BERT masked language model on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# Training arguments
# No evaluation settings are passed: the Trainer below receives no
# eval_dataset, and step-wise evaluation without one makes Trainer fail.
# NOTE(review): the tokenization cell processed 25,000 texts, i.e. roughly 782
# optimizer steps per epoch at batch size 32 on a single device. With
# warmup_steps=10000 the learning rate would still be warming up when training
# ends, and with save_steps=500 at most one intermediate checkpoint is written.
# Confirm these values are intended.
training_args = TrainingArguments(
    output_dir="./pretrained_bert",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    warmup_steps=10000
)

# Create Trainer instance for pre-training (training data only, no eval set)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

print("STARTED TRAINING")

# Trainer.train() runs the complete training loop on its own (batching,
# optimisation and loss logging every `logging_steps`), so it is called
# exactly once rather than from inside a hand-written batch loop.
train_result = trainer.train()

# Summarise the run from the values returned by the Trainer.
print(f"Steps: {train_result.global_step}, Loss: {train_result.training_loss}")

print("TRAINING DONE")

100%|██████████| 25000/25000 [01:23<00:00, 299.23it/s]


In [5]:
import re
import os
import torch
from torch import nn
from tqdm.auto import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Regex to extract category, subcategory, and question
regex = r"([\w]+):([\w]+) (.*)"

def read_questions(filepath):
    """Parse a question file whose lines look like ``CATEGORY:subcategory question``.

    Lines that do not match ``regex`` are ignored.

    Args:
        filepath: Path of the text file to read.

    Returns:
        tuple: ``(questions, categories)`` where ``questions`` is a list of
        ``(category, subcategory, question)`` tuples in file order and
        ``categories`` is the alphabetically sorted list of distinct
        categories. Sorting keeps the order (and any label ids derived from
        it) identical across runs; plain set iteration order is not stable
        between interpreter sessions.
    """
    questions = []
    categories = set()
    with open(filepath, 'r') as f:
        for line in f:
            match = re.search(regex, line)
            if match:
                category, subcategory, question = match.groups()
                questions.append((category, subcategory, question))
                categories.add(category)
    return questions, sorted(categories)

def preprocess(questions, tokenizer):
    """Tokenize the question text of every ``(category, subcategory, question)`` tuple.

    Each question is passed to ``tokenizer`` with ``truncation=True`` and
    ``padding='max_length'``.

    NOTE: this reuses the name ``preprocess`` of the helper defined in the
    pre-training cell and replaces it once this cell has run.

    Args:
        questions: Iterable of 3-tuples; only the third element is tokenized.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.

    Returns:
        tuple: ``(input_ids, attention_masks)``, two lists aligned with
        ``questions``.
    """
    encodings = [
        tokenizer(question, truncation=True, padding='max_length')
        for _category, _subcategory, question in questions
    ]
    input_ids = [encoding['input_ids'] for encoding in encodings]
    attention_masks = [encoding['attention_mask'] for encoding in encodings]
    return input_ids, attention_masks

# Location of the question file
dataset_dir = "./datasets"
filename = "TREC_test.txt"
filepath = os.path.join(dataset_dir, filename)

# Tokenizer matching the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Read the labelled questions, then tokenize their text
questions, categories = read_questions(filepath)
input_ids, attention_masks = preprocess(questions, tokenizer)

# Give each category an integer id equal to its position in `categories`
label_map = dict(zip(categories, range(len(categories))))

# One integer label per question, looked up from its category (first field)
labels = [label_map[entry[0]] for entry in questions]

In [9]:
# YAHOO DATASET READING

import csv
import re
from transformers import BertTokenizer

# Load class labels (one class name per line)
with open('datasets/yahoo_answers_csv/classes.txt', encoding='utf-8') as f:
    categories = [line.strip() for line in f]

# Load dataset
texts = []
labels = []
# newline='' is what the csv module expects, so that line breaks inside quoted
# fields are parsed by the reader rather than by the file object.
with open('datasets/yahoo_answers_csv/train.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row_number, row in enumerate(reader):
        # Skip the first row only when it is not a data row (blank, or its
        # first field is not a numeric class index). Skipping it
        # unconditionally would silently drop the first sample of a
        # header-less file.
        if row_number == 0 and not (row and row[0].strip().isdigit()):
            continue
        label = int(row[0]) - 1  # Class index starts from 1
        text = "{} {}".format(row[1], row[2])  # Title + Content
        text = text.replace('\\"', '"')  # Unescape quotes
        # Escaped newlines (backslash + "n") become a space so the words on
        # either side are not glued together.
        text = text.replace('\\n', ' ')
        texts.append(text)
        labels.append(label)

# Tokenize
# NOTE(review): every example is padded to the tokenizer's maximum length and
# kept in Python lists; for a large train.csv this may need a lot of memory.
# Confirm this is acceptable.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids = []
attn_masks = []

for text in texts:
    encoded = tokenizer(text, truncation=True, padding='max_length')
    input_ids.append(encoded['input_ids'])
    attn_masks.append(encoded['attention_mask'])

# The training cell reads `attention_masks`; expose the masks under that name
# as well so they stay aligned with the `input_ids` and `labels` built here.
attention_masks = attn_masks

print(f"{len(input_ids)} examples tokenized")

FileNotFoundError: [Errno 2] No such file or directory: '/datasets/yahoo_answers_csv/classes.txt'

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model architecture: BERT encoder with a classification head sized to the
# number of categories
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(categories)
)

model.to(device)
# torch.as_tensor accepts both the Python lists produced by tokenization and
# tensors left over from a previous run of this cell, so re-running the cell
# does not trigger the copy-construct warning of torch.tensor(tensor).
input_ids = torch.as_tensor(input_ids).to(device)
attention_masks = torch.as_tensor(attention_masks).to(device)
labels = torch.as_tensor(labels).to(device)

# Dataloaders
dataset = TensorDataset(input_ids, attention_masks, labels)

batch_size = 32
# NOTE(review): SequentialSampler feeds the examples in the same fixed order
# every epoch; training loops normally shuffle. Confirm this is intended.
dataloader = DataLoader(
    dataset,
    sampler=SequentialSampler(dataset),
    batch_size=batch_size
)

# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fct = nn.CrossEntropyLoss()

epochs = 3  # Define the number of epochs

# Training loop
for epoch in tqdm(range(1, epochs + 1), desc="Epoch"):
    model.train()
    loss_train = 0

    for batch in dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = batch

        # Labels are not passed to the model: the loss is computed once,
        # below, instead of once inside the model and again here.
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        loss = loss_fct(outputs.logits, batch_labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loss_train += loss.item()

    # Mean loss over the batches of this epoch
    print(f'Training loss: {loss_train / len(dataloader)}')

Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Training loss: 1.668364830315113
Training loss: 1.3145403265953064
Training loss: 1.0003667697310448
