In [1]:
############################
# GPU and CPU Check Code
# KEEP AT THE TOP
############################

# !pip install psutil
# !pip install gputil

import os
import torch
import spacy
import psutil
import pandas as pd
import multiprocessing

from functools import partial
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

print("IMPORTS DONE")

IMPORTS DONE


In [2]:
# Ask psutil for the number of physical and logical CPU cores.
num_cpus = psutil.cpu_count(logical=False)  # physical cores
num_logical_cpus = psutil.cpu_count(logical=True)  # logical cores

print(
    f"Number of physical CPUs: {num_cpus}",
    f"Number of logical CPUs: {num_logical_cpus}",
    sep="\n",
)

try:
    # GPUtil is optional: the import sits inside the try so that a missing
    # package only replaces the GPU report with a short notice.
    import GPUtil

    # Enumerate the GPUs that GPUtil can see.
    gpus = GPUtil.getGPUs()
    num_gpus = len(gpus)

    print(f"Number of GPUs: {num_gpus}")

    # One report per device, numbered from 1.
    for gpu_number, gpu in enumerate(gpus, start=1):
        report_lines = [
            f"GPU {gpu_number}: {gpu.name}",
            f"\tUUID: {gpu.uuid}",
            f"\tMemory Total: {gpu.memoryTotal} MB",
            f"\tMemory Used: {gpu.memoryUsed} MB",
            f"\tMemory Free: {gpu.memoryFree} MB",
            f"\tGPU Utilization: {gpu.load * 100}%",
            f"\tGPU Temperature: {gpu.temperature} °C",
        ]
        print("\n".join(report_lines))
except ImportError:
    print("GPUtil library not found. Cannot check GPU information.")

Number of physical CPUs: 128
Number of logical CPUs: 128
Number of GPUs: 2
GPU 1: NVIDIA A100 80GB PCIe
	UUID: GPU-07b54f56-7ab0-e0e3-f30e-82863135b47c
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 32.0 °C
GPU 2: NVIDIA A100 80GB PCIe
	UUID: GPU-9f6bf722-c96f-671a-e64f-1dc0c6033940
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 29.0 °C


In [10]:
# # USE ONLY TO EXTRACT FILES FROM TAR FILES

# import tarfile

# def extract_all_files(tar_file_path, extract_to):
#     with tarfile.open(tar_file_path, 'r') as tar:
#         tar.extractall(extract_to)

# # Example usage
# tar_file_path = 'datasets/qrels.trec8.qa.gz'
# extract_to = 'datasets/'
# extract_all_files(tar_file_path, extract_to)

In [None]:
# # Load data from train.csv using pandas without header (assuming no column names)
# train_file_path = 'datasets/yahoo_answers_csv/train.csv'
# test_file_path = 'datasets/yahoo_answers_csv/test.csv'

# df_train = pd.read_csv(train_file_path, header=None)
# df_train.columns = ['id', 'part_1', 'part_2', 'answer']  # Modify this based on your data structure

# df_test = pd.read_csv(test_file_path, header=None)
# df_test.columns = ['id', 'part_1', 'part_2', 'answer']  # Modify this based on your data structure

# # Load classes.txt to associate labels with IDs
# classes_file_path = 'datasets/yahoo_answers_csv/classes.txt'
# with open(classes_file_path, 'r') as file:
#     lines = file.readlines()

# # Create a dictionary to map IDs to labels
# label_dict = {}
# for index, line in enumerate(lines):
#     label_dict[index + 1] = line.strip()  # Assign label IDs based on line number

# # Combine the two parts of the question into a single column for train and test sets
# df_train['question'] = df_train['part_1'] + " " + df_train['part_2']
# df_test['question'] = df_test['part_1'] + " " + df_test['part_2']

# # Initialize the BERT tokenizer and spaCy
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# nlp = spacy.load("en_core_web_sm")

# # Process the train and test data for BERT input
# formatted_train_data = []
# formatted_test_data = []
# max_seq_length = 512  # Maximum sequence length for BERT

# def process_text(text):
#     doc = nlp(text)
#     sentences = [sent.text for sent in doc.sents]  # Extract individual sentences
#     return sentences

# def encode_text(text):
#     encoded_dict = tokenizer.encode_plus(
#         text,
#         add_special_tokens=True,
#         max_length=max_seq_length,
#         padding='max_length',
#         truncation=True,
#         return_tensors='pt'
#     )
#     return encoded_dict

# # Process train data
# for index, row in df_train.iterrows():
#     question = str(row['question'])
#     answer = str(row['answer'])

#     combined_text = question + " " + answer
#     sentences = process_text(combined_text)

#     for sentence in sentences:
#         encoded_dict = encode_text(sentence)

#         if encoded_dict['input_ids'].shape[1] > max_seq_length:
#             input_ids = torch.cat(
#                 (encoded_dict['input_ids'][:, :128], encoded_dict['input_ids'][:, -382:]), dim=1)
#             attention_mask = torch.cat(
#                 (encoded_dict['attention_mask'][:, :128], encoded_dict['attention_mask'][:, -382:]), dim=1)
#             token_type_ids = torch.cat(
#                 (encoded_dict['token_type_ids'][:, :128], encoded_dict['token_type_ids'][:, -382:]), dim=1)
#         else:
#             input_ids = encoded_dict['input_ids']
#             attention_mask = encoded_dict['attention_mask']
#             token_type_ids = encoded_dict['token_type_ids']

#         id_number = int(row['id'])
#         label = label_dict.get(id_number, None)

#         formatted_train_data.append({
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'token_type_ids': token_type_ids,
#             'label': label
#         })

# # Process test data
# for index, row in df_test.iterrows():
#     question = str(row['question'])
#     answer = str(row['answer'])

#     combined_text = question + " " + answer
#     sentences = process_text(combined_text)

#     for sentence in sentences:
#         encoded_dict = encode_text(sentence)

#         if encoded_dict['input_ids'].shape[1] > max_seq_length:
#             input_ids = torch.cat(
#                 (encoded_dict['input_ids'][:, :128], encoded_dict['input_ids'][:, -382:]), dim=1)
#             attention_mask = torch.cat(
#                 (encoded_dict['attention_mask'][:, :128], encoded_dict['attention_mask'][:, -382:]), dim=1)
#             token_type_ids = torch.cat(
#                 (encoded_dict['token_type_ids'][:, :128], encoded_dict['token_type_ids'][:, -382:]), dim=1)
#         else:
#             input_ids = encoded_dict['input_ids']
#             attention_mask = encoded_dict['attention_mask']
#             token_type_ids = encoded_dict['token_type_ids']

#         id_number = int(row['id'])
#         label = label_dict.get(id_number, None)

#         formatted_test_data.append({
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'token_type_ids': token_type_ids,
#             'label': label
#         })

# # Display a sample of the formatted train and test data
# print("Formatted train data sample:")
# for i in range(1):  # Print the first sample from train data
#     print(formatted_train_data[i])

# print("Formatted test data sample:")
# for i in range(1):  # Print the first sample from test data
#     print(formatted_test_data[i])

In [8]:
# Initialize the BERT tokenizer used to turn text into WordPiece tokens / ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the spaCy pipeline used for sentence segmentation.
# The cells shown in this notebook only read `doc.sents` from the pipeline, so
# the named-entity recognizer is disabled: it is not needed for sentence
# boundaries and skipping it saves work on every one of the processed texts.
# NOTE(review): re-enable "ner" if another cell relies on `doc.ents`.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

def segment_sentences(text):
    """Split ``text`` into sentences using the notebook-level spaCy pipeline ``nlp``.

    Args:
        text: The value to segment. It is passed through ``str()`` first, so
            non-string values (e.g. floats) are accepted.

    Returns:
        list[str]: The text of each sentence span reported by ``doc.sents``,
        in document order.
    """
    # str() guards against non-string cells such as floats.
    parsed = nlp(str(text))
    return [span.text for span in parsed.sents]

def process_data(texts, labels, max_seq_length=512, max_head_tokens=128, max_tail_tokens=382,
                 bert_tokenizer=None, sentence_splitter=None):
    """Convert raw texts into fixed-length BERT inputs using head+tail truncation.

    Each text is sentence-segmented, re-joined with single spaces, tokenized,
    wrapped in ``[CLS]`` / ``[SEP]`` and padded to ``max_seq_length``. Texts that
    do not fit keep their first ``max_head_tokens`` and last ``max_tail_tokens``
    content tokens; ``[CLS]`` and ``[SEP]`` are always preserved, so a truncated
    sequence with the default budgets is exactly ``max_seq_length`` long.

    Args:
        texts: Iterable of texts (must support ``len()``).
        labels: Iterable of labels, aligned with ``texts``; stored unchanged.
        max_seq_length: Length of every returned sequence, including the two
            special tokens. Must be at least 2.
        max_head_tokens: Content tokens kept from the start of a long text.
        max_tail_tokens: Content tokens kept from the end of a long text.
            If ``max_head_tokens + max_tail_tokens + 2`` exceeds
            ``max_seq_length`` the budgets are clipped (head first, then tail)
            so the result never exceeds ``max_seq_length``.
        bert_tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``pad_token_id``. Defaults to the notebook-level ``tokenizer``.
        sentence_splitter: Callable mapping a text to a list of sentences.
            Defaults to the notebook-level ``segment_sentences``.

    Returns:
        list[dict]: One dict per text with keys ``'input_ids'`` and
        ``'attention_mask'`` (tensors of shape ``(1, max_seq_length)``) and
        ``'label'``.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2 or a token budget
            is negative.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")
    if max_head_tokens < 0 or max_tail_tokens < 0:
        raise ValueError("max_head_tokens and max_tail_tokens must be non-negative")

    # Fall back to the objects created at notebook level when none are injected.
    if bert_tokenizer is None:
        bert_tokenizer = tokenizer
    if sentence_splitter is None:
        sentence_splitter = segment_sentences

    # Two positions are reserved for [CLS] and [SEP]; clip the budgets so that
    # head + tail can never overflow the remaining space.
    content_budget = max_seq_length - 2
    head_budget = min(max_head_tokens, content_budget)
    tail_budget = min(max_tail_tokens, content_budget - head_budget)

    formatted_data = []
    total_texts = len(texts)

    for i, (text, label) in enumerate(zip(texts, labels), start=1):
        # Segment sentences and re-join them with single spaces
        sentences = sentence_splitter(text)
        processed_text = " ".join(sentences)

        # Tokenize the processed text (special tokens are added afterwards)
        content_tokens = bert_tokenizer.tokenize(processed_text)

        if len(content_tokens) > content_budget:
            # Keep the first head_budget and the last tail_budget content tokens.
            # A slice of [-0:] would return the whole list, hence the guard.
            tail_tokens = content_tokens[-tail_budget:] if tail_budget > 0 else []
            content_tokens = content_tokens[:head_budget] + tail_tokens

        # Add [CLS] and [SEP] after truncation so neither can be cut off
        tokens = ['[CLS]'] + content_tokens + ['[SEP]']

        input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        # Padding (padding_length is never negative thanks to the clipped budgets)
        padding_length = max_seq_length - len(input_ids)
        input_ids += [bert_tokenizer.pad_token_id] * padding_length
        attention_mask += [0] * padding_length

        input_ids = torch.tensor(input_ids)
        attention_mask = torch.tensor(attention_mask)

        # Append to formatted data including 'label'
        formatted_data.append({
            'input_ids': input_ids.unsqueeze(0),  # Unsqueeze for batch dimension
            'attention_mask': attention_mask.unsqueeze(0),  # Unsqueeze for batch dimension
            'label': label
        })

        # Print progress update
        if i % 50000 == 0 or i == total_texts:
            print(f"Processed {i}/{total_texts} texts")

    return formatted_data

# Load data using pandas. The CSV files are read without a header row, so the
# column names are assigned explicitly.
train_file_path = 'datasets/yahoo_answers_csv/train.csv'
test_file_path = 'datasets/yahoo_answers_csv/test.csv'

df_train = pd.read_csv(train_file_path, header=None)
df_train.columns = ['id', 'part_1', 'part_2', 'answer']  # Modify this based on your data structure

df_test = pd.read_csv(test_file_path, header=None)
df_test.columns = ['id', 'part_1', 'part_2', 'answer']  # Modify this based on your data structure

# Load classes.txt to associate labels with IDs
classes_file_path = 'datasets/yahoo_answers_csv/classes.txt'
with open(classes_file_path, 'r') as file:
    lines = file.readlines()

# Map IDs to label names; IDs are 1-based line numbers of classes.txt
label_dict = {line_number: line.strip() for line_number, line in enumerate(lines, start=1)}

# Combine the two parts of the question into a single column for train and test sets.
# Missing parts are replaced by an empty string first: adding NaN to a string
# yields NaN, which would otherwise reach the tokenizer as the literal text "nan"
# and discard the part that was present.
for frame in (df_train, df_test):
    frame['question'] = (
        frame['part_1'].fillna('').astype(str)
        + " "
        + frame['part_2'].fillna('').astype(str)
    ).str.strip()

# Process the train and test data for BERT input.
# The 'id' column is what gets stored as each example's 'label'.
formatted_train_data = process_data(df_train['question'], df_train['id'], max_seq_length=512)
formatted_test_data = process_data(df_test['question'], df_test['id'], max_seq_length=512)

# Display a sample of the formatted train and test data
print("Formatted train data sample:")
print(formatted_train_data[0])

print("Formatted test data sample:")
print(formatted_test_data[0])

Processed 50000/1400000 texts
Processed 100000/1400000 texts
Processed 150000/1400000 texts
Processed 200000/1400000 texts
Processed 250000/1400000 texts
Processed 300000/1400000 texts
Processed 350000/1400000 texts
Processed 400000/1400000 texts
Processed 450000/1400000 texts
Processed 500000/1400000 texts
Processed 550000/1400000 texts
Processed 600000/1400000 texts
Processed 650000/1400000 texts
Processed 700000/1400000 texts
Processed 750000/1400000 texts
Processed 800000/1400000 texts
Processed 850000/1400000 texts
Processed 900000/1400000 texts
Processed 950000/1400000 texts
Processed 1000000/1400000 texts
Processed 1050000/1400000 texts
Processed 1100000/1400000 texts
Processed 1150000/1400000 texts
Processed 1200000/1400000 texts
Processed 1250000/1400000 texts
Processed 1300000/1400000 texts
Processed 1350000/1400000 texts
Processed 1400000/1400000 texts
Processed 50000/60000 texts
Processed 60000/60000 texts
Formatted train data sample:
{'input_ids': tensor([[ 101, 2339, 2987

In [None]:
import time

# Set CUDA_LAUNCH_BLOCKING for potential debugging
# NOTE(review): this is a debugging aid and is expected to slow training down;
# confirm whether it should stay enabled for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


# Split the preprocessed training examples into train and validation sets
train_data, val_data = train_test_split(formatted_train_data, test_size=0.1, random_state=42)

# Stack the per-example (1, seq_len) tensors into (num_examples, seq_len) tensors.
# process_data() does not emit 'token_type_ids'; every example is a single
# segment, so the model is called without them and uses its default.
train_inputs = torch.cat([example['input_ids'] for example in train_data], dim=0)
train_masks = torch.cat([example['attention_mask'] for example in train_data], dim=0)

val_inputs = torch.cat([example['input_ids'] for example in val_data], dim=0)
val_masks = torch.cat([example['attention_mask'] for example in val_data], dim=0)

# Extract labels from the dataset
train_labels_raw = [example['label'] for example in train_data]
val_labels_raw = [example['label'] for example in val_data]

# Fit a LabelEncoder on all labels so raw label values map to 0..num_classes-1
label_encoder = LabelEncoder()
all_labels = train_labels_raw + val_labels_raw
label_encoder.fit(all_labels)

# Transform the raw labels into their numerical class indices
train_labels = torch.tensor(label_encoder.transform(train_labels_raw))
val_labels = torch.tensor(label_encoder.transform(val_labels_raw))

# Define batch size and create DataLoaders
batch_size = 24

train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the BERT model and tokenizer - Use BERT-Base or BERT-Large
model_name = 'bert-base-uncased'  # For BERT-Base
num_labels = len(label_dict)  # Number of labels based on classes.txt

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999))

num_epochs = 1  # You can adjust the number of epochs
accumulation_steps = 4  # Accumulate gradients over 4 steps

criterion = torch.nn.CrossEntropyLoss()

# Total number of optimizer updates. The scheduler is stepped once per
# optimizer update (not once per micro-batch), so the schedule length must be
# counted in updates; the final, possibly partial, accumulation group of an
# epoch also triggers an update, hence the ceiling division.
updates_per_epoch = (len(train_dataloader) + accumulation_steps - 1) // accumulation_steps
total_steps = updates_per_epoch * num_epochs

# Warmup Proportion
warmup_proportion = 0.1

# Calculate the warmup steps
warmup_steps = int(total_steps * warmup_proportion)

# Define the learning rate scheduler
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=2e-5,
    total_steps=total_steps,
    pct_start=float(warmup_steps) / float(total_steps),
    anneal_strategy='linear',
    div_factor=25.0,
    final_div_factor=1000.0
)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Training
    model.train()
    total_loss = 0
    print("Training...")

    start_time = time.time()  # Record start time for the training part of the epoch

    optimizer.zero_grad()

    for batch_idx, batch in enumerate(train_dataloader):
        inputs, masks, labels = tuple(t.to(device) for t in batch)

        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits

        # Scale the loss so the accumulated gradient is an average over the group
        loss = criterion(logits, labels)
        loss = loss / accumulation_steps
        loss.backward()

        # Update once per accumulation group and at the last batch of the epoch
        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(train_dataloader):
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()  # Update learning rate

        # Undo the scaling so the reported loss is the per-batch loss
        total_loss += loss.item() * accumulation_steps

        if batch_idx % 50 == 0:
            print(f"Processed batch {batch_idx}/{len(train_dataloader)}")

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Train Loss: {avg_train_loss:.4f}")

    end_time = time.time()  # Record end time for the training part of the epoch
    epoch_time = end_time - start_time  # Calculate epoch training duration
    print(f"Training Duration for Epoch {epoch + 1}: {epoch_time:.2f} seconds")

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    print("Validating...")

    for batch in val_dataloader:
        inputs, masks, labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)
            logits = outputs.logits

        predictions = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(predictions == labels).item()
        total_predictions += len(labels)

        val_loss += criterion(logits, labels).item()

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / total_predictions
    error_rate = 1 - accuracy

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy:.4f}, Validation Error Rate: {error_rate:.4f}")

# Save the model
torch.save(model.state_dict(), 'finetuned_bert_base.pth')

In [7]:
# Sample Results

# Train Loss: 0.9030
# Validation Loss: 0.8534, Validation Accuracy: 0.7152, Validation Error Rate: 0.2848

In [None]:
# Fine-tune BERT for up to `num_epochs` epochs with layer-wise learning rates
# and a linear warmup/decay schedule, validating and checkpointing every epoch.

batch_size = 24
num_epochs = 4  # Maximum epochs
accumulation_steps = 1  # No gradient accumulation in this example
warmup_proportion = 0.1
decay_factor = 0.95  # Per-layer learning-rate decay factor
base_lr = 2e-5  # Base learning rate for the top layer and the classifier


def _examples_to_tensors(examples, label_to_index):
    """Stack process_data() examples into (inputs, masks, labels) tensors.

    Each example holds (1, seq_len) tensors, so they are concatenated along
    dim 0; labels are mapped to zero-based class indices via ``label_to_index``.
    """
    inputs = torch.cat([example['input_ids'] for example in examples], dim=0)
    masks = torch.cat([example['attention_mask'] for example in examples], dim=0)
    labels = torch.tensor([label_to_index[int(example['label'])] for example in examples])
    return inputs, masks, labels


def _layerwise_param_groups(model, top_lr, decay):
    """Build optimizer parameter groups whose learning rate shrinks with depth.

    Parameters outside the encoder layers and embeddings (e.g. the classifier)
    use ``top_lr``; the top encoder layer also uses ``top_lr``; every layer
    below is multiplied by ``decay`` once more, and the embeddings sit one step
    below the lowest encoder layer.
    """
    num_layers = model.config.num_hidden_layers
    params_by_depth = {}
    for name, param in model.named_parameters():
        if "encoder.layer." in name:
            layer_index = int(name.split("encoder.layer.")[1].split(".")[0])
            depth = num_layers - 1 - layer_index
        elif "embeddings" in name:
            depth = num_layers
        else:
            depth = 0
        params_by_depth.setdefault(depth, []).append(param)
    return [
        {'params': params, 'lr': top_lr * (decay ** depth)}
        for depth, params in sorted(params_by_depth.items())
    ]


# Raw labels are the 1-based IDs used as keys of label_dict; the loss expects
# class indices in [0, num_labels), so map them explicitly.
label_to_index = {label_id: index for index, label_id in enumerate(sorted(label_dict))}

# Re-create the train/validation split here (fixed random_state) so this cell
# does not rely on variables or DataLoaders left behind by another cell.
train_data, val_data = train_test_split(formatted_train_data, test_size=0.1, random_state=42)

train_inputs, train_masks, train_labels = _examples_to_tensors(train_data, label_to_index)
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_inputs, val_masks, val_labels = _examples_to_tensors(val_data, label_to_index)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Creating DataLoader for test set
test_inputs, test_masks, test_labels = _examples_to_tensors(formatted_test_data, label_to_index)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Define the criterion (loss function)
criterion = torch.nn.CrossEntropyLoss()

# Model Architecture - Initialize the BERT model and tokenizer
model_name = 'bert-base-uncased'
num_labels = len(label_dict)  # Number of labels based on classes.txt

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimization Settings: one parameter group per depth, each with its own
# learning rate. All layers are trained from the first step.
optimizer = AdamW(
    _layerwise_param_groups(model, base_lr, decay_factor),
    lr=base_lr,
    betas=(0.9, 0.999),
)

# Total number of optimizer updates (the scheduler is stepped once per update)
updates_per_epoch = (len(train_dataloader) + accumulation_steps - 1) // accumulation_steps
total_steps = updates_per_epoch * num_epochs

# Slanted triangular learning rate: linear warmup followed by linear decay.
# The schedule scales every parameter group's own learning rate by the same
# factor, so the layer-wise ratios are preserved throughout training.
warmup_steps = int(total_steps * warmup_proportion)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    model.train()
    total_loss = 0

    optimizer.zero_grad()

    for batch_idx, batch in enumerate(train_dataloader):
        inputs, masks, labels = tuple(t.to(device) for t in batch)

        # Single-segment inputs: token_type_ids are left to the model default
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits

        loss = criterion(logits, labels)
        total_loss += loss.item()

        # Scale so accumulated gradients average over the accumulation group
        (loss / accumulation_steps).backward()

        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(train_dataloader):
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Train Loss: {avg_train_loss:.4f}")

    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in val_dataloader:
        inputs, masks, labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)
            logits = outputs.logits

        predictions = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(predictions == labels).item()
        total_predictions += len(labels)

        val_loss += criterion(logits, labels).item()

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / total_predictions
    error_rate = 1 - accuracy

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy:.4f}, Validation Error Rate: {error_rate:.4f}")

    # Save the model after each epoch if needed
    # Modify the path as needed to save models with different names for each epoch
    torch.save(model.state_dict(), f'finetuned_bert_epoch_{epoch + 1}.pth')