In [1]:
############################
# GPU and CPU Check Code
# KEEP AT THE TOP
############################

# !pip install psutil
# !pip install gputil

import os
import time
import torch
import spacy
import psutil
import pandas as pd
import multiprocessing

from functools import partial
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW


print("IMPORTS DONE")

IMPORTS DONE


In [4]:
# Get the number of CPUs
num_cpus = psutil.cpu_count(logical=False)  # physical cores
num_logical_cpus = psutil.cpu_count(logical=True)  # logical cores

print(f"Number of physical CPUs: {num_cpus}")
print(f"Number of logical CPUs: {num_logical_cpus}")

try:
    import GPUtil

    # Get the number of available GPUs
    gpus = GPUtil.getGPUs()
    num_gpus = len(gpus)

    print(f"Number of GPUs: {num_gpus}")

    for i, gpu in enumerate(gpus):
        print(f"GPU {i + 1}: {gpu.name}")
        print(f"\tUUID: {gpu.uuid}")
        print(f"\tMemory Total: {gpu.memoryTotal} MB")
        print(f"\tMemory Used: {gpu.memoryUsed} MB")
        print(f"\tMemory Free: {gpu.memoryFree} MB")
        print(f"\tGPU Utilization: {gpu.load * 100}%")
        print(f"\tGPU Temperature: {gpu.temperature} °C")
except ImportError:
    print("GPUtil library not found. Cannot check GPU information.")

Number of physical CPUs: 128
Number of logical CPUs: 128
Number of GPUs: 2
GPU 1: NVIDIA A100 80GB PCIe
	UUID: GPU-c9f222a4-cdf9-a690-b27c-fd6ca8dd8832
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 31.0 °C
GPU 2: NVIDIA A100 80GB PCIe
	UUID: GPU-051052d6-9db8-7e01-b26b-69a92eab9f9e
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 28.0 °C


In [16]:
# Directory paths
train_pos_dir = 'datasets/aclImdb/train/pos/'
train_neg_dir = 'datasets/aclImdb/train/neg/'
test_pos_dir = 'datasets/aclImdb/test/pos/'
test_neg_dir = 'datasets/aclImdb/test/neg/'

# Function to read text files from a directory and assign labels based on directory structure
def read_text_files_from_directory(directory):
    texts = []
    labels = []
    # For positive reviews (assuming 'pos' directory corresponds to label 1)
    if 'pos' in directory:
        label = 1
    else:
        label = 0  # For negative reviews
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            text = file.read().replace('\n', ' ')
            texts.append(text)
            labels.append(label)
    return texts, labels

# Read positive and negative training data with labels
train_pos_texts, train_pos_labels = read_text_files_from_directory(train_pos_dir)
train_neg_texts, train_neg_labels = read_text_files_from_directory(train_neg_dir)

# Read positive and negative test data with labels
test_pos_texts, test_pos_labels = read_text_files_from_directory(test_pos_dir)
test_neg_texts, test_neg_labels = read_text_files_from_directory(test_neg_dir)

# Combine texts and labels
train_texts = train_pos_texts + train_neg_texts
train_labels = train_pos_labels + train_neg_labels
test_texts = test_pos_texts + test_neg_texts
test_labels = test_pos_labels + test_neg_labels

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Process the data for BERT input
formatted_data = []
max_seq_length = 512  # Maximum sequence length for BERT

# Tokenize and format data
for text in train_texts + test_texts:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_ids = encoded_dict['token_type_ids']

    formatted_data.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
    })

# Display the formatted data sample
print("Formatted data sample:")
for i in range(1):
    print(formatted_data[i])

# Check the lengths of labels and formatted data
print(f"Number of training samples: {len(train_labels)}")
print(f"Number of testing samples: {len(test_labels)}")
print(f"Number of formatted data samples: {len(formatted_data)}")

Formatted data sample:
{'input_ids': tensor([[  101,  1045,  3866,  2146,  2126,  2461,  1998,  2347,  1005,  1056,
          2130,  5204,  1997,  2679,  2000,  4830,  6673,  2127,  1045,  2387,
          2009,  2006,  1996, 15475,  1997,  2026,  2334, 17006,  1012,  1045,
          4149,  2009,  1998,  2044,  1037,  3621,  1005, 17012,  2097,  2023,
          2022,  2004,  2204,  1005,  2034,  2792,  1045,  2787,  2008,  2009,
          2001,  1012,  4918, 22017, 14515,  2001,  2307,  2004,  2020,  1996,
          2060,  2372,  1997,  1996,  3626,  1012,  2307,  2000,  2156,  2032,
          2007,  1041,  7447,  2153,  1012,  2045,  2001,  1037,  4189,  2978,
          1997, 25082,  1999,  2009,  2021,  2008,  2134,  1005,  1056,  8572,
          2033,  1012,  2004,  2005,  2037,  2108,  2053,  5254,  1997,  2009,
          2006,  1996,  7427,  1012,  2008,  2015,  2062,  2000,  2079,  2007,
          1996, 10021,  4654,  3401, 27718,  2013,  5579,  8196,  2008,  1996,
         22861,

In [20]:
# ... (previous code remains unchanged)

# Set CUDA_LAUNCH_BLOCKING for potential debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Split the data into train and validation sets
train_data, val_data, train_labels_raw, val_labels_raw = train_test_split(
    formatted_data, combined_labels, test_size=0.1, random_state=42
)

# Convert data to PyTorch tensors
train_inputs = torch.cat([example['input_ids'] for example in train_data], dim=0)
train_masks = torch.cat([example['attention_mask'] for example in train_data], dim=0)
train_token_types = torch.cat([example['token_type_ids'] for example in train_data], dim=0)

val_inputs = torch.cat([example['input_ids'] for example in val_data], dim=0)
val_masks = torch.cat([example['attention_mask'] for example in val_data], dim=0)
val_token_types = torch.cat([example['token_type_ids'] for example in val_data], dim=0)

# Define batch size and create DataLoader
batch_size = 24

train_labels = torch.tensor(train_labels_raw)
val_labels = torch.tensor(val_labels_raw)

train_dataset = TensorDataset(train_inputs, train_masks, train_token_types, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(val_inputs, val_masks, val_token_types, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the BERT model and tokenizer - Use BERT-Base or BERT-Large
model_name = 'bert-base-uncased'  # For BERT-Base
num_labels = 2  # Considering binary classification (positive/negative sentiment)

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

num_epochs = 4 # You can adjust the number of epochs

best_val_accuracy = 0.0
best_epoch = 0

# Training and Validation loop with epoch timing
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    start_time = time.time()  # Start time for epoch

    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, token_types, labels = batch

        optimizer.zero_grad()

        outputs = model(inputs, attention_mask=masks, token_type_ids=token_types)
        logits = outputs.logits

        loss = torch.nn.functional.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % 250 == 0:
            print(f"Processed batch {batch_idx+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Train Loss: {avg_train_loss:.4f}")
    
    end_time = time.time()  # End time for epoch
    epoch_time = end_time - start_time  # Calculate epoch time
    print(f"Epoch Training Time: {epoch_time:.2f} seconds")

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs, masks, token_types, labels = batch

            outputs = model(inputs, attention_mask=masks, token_type_ids=token_types)
            logits = outputs.logits

            loss = torch.nn.functional.cross_entropy(logits, labels)
            val_loss += loss.item()

            predictions = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(predictions == labels).item()

    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = correct_predictions / len(val_dataset)

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy * 100:.2f}%")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_epoch = epoch
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')

    end_time = time.time()  # End time for epoch
    epoch_time = end_time - start_time  # Calculate epoch time
    print(f"Full Epoch Time: {epoch_time:.2f} seconds")

print(f"Best validation accuracy of {best_val_accuracy*100:.2f} achieved at epoch {best_epoch + 1}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Epoch 1/4
Processed batch 250/1875, Loss: 0.1830
Processed batch 500/1875, Loss: 0.3208
Processed batch 750/1875, Loss: 0.2559
Processed batch 1000/1875, Loss: 0.1775
Processed batch 1250/1875, Loss: 0.2661
Processed batch 1500/1875, Loss: 0.1525
Processed batch 1750/1875, Loss: 0.1152
Train Loss: 0.2053
Epoch Training Time: 1040.72 seconds
Validation Loss: 0.1506, Validation Accuracy: 94.48%
Full Epoch Time: 1076.97 seconds

Epoch 2/4
Processed batch 250/1875, Loss: 0.2292
Processed batch 500/1875, Loss: 0.0455
Processed batch 750/1875, Loss: 0.0326
Processed batch 1000/1875, Loss: 0.2025
Processed batch 1250/1875, Loss: 0.0547
Processed batch 1500/1875, Loss: 0.1200
Processed batch 1750/1875, Loss: 0.6036
Train Loss: 0.1100
Epoch Training Time: 1039.21 seconds
Validation Loss: 0.1653, Validation Accuracy: 94.30%
Full Epoch Time: 1074.79 seconds

Epoch 3/4
Processed batch 250/1875, Loss: 0.0105
Processed batch 500/1875, Loss: 0.0047
Processed batch 750/1875, Loss: 0.0079
Processed ba

SAMPLE OUTPUT

Epoch 1/4
Processed batch 250/1875, Loss: 0.1830
Processed batch 500/1875, Loss: 0.3208
Processed batch 750/1875, Loss: 0.2559
Processed batch 1000/1875, Loss: 0.1775
Processed batch 1250/1875, Loss: 0.2661
Processed batch 1500/1875, Loss: 0.1525
Processed batch 1750/1875, Loss: 0.1152
Train Loss: 0.2053
Epoch Training Time: 1040.72 seconds
Validation Loss: 0.1506, Validation Accuracy: 94.48%
Full Epoch Time: 1076.97 seconds

Epoch 2/4
Processed batch 250/1875, Loss: 0.2292
Processed batch 500/1875, Loss: 0.0455
Processed batch 750/1875, Loss: 0.0326
Processed batch 1000/1875, Loss: 0.2025
Processed batch 1250/1875, Loss: 0.0547
Processed batch 1500/1875, Loss: 0.1200
Processed batch 1750/1875, Loss: 0.6036
Train Loss: 0.1100
Epoch Training Time: 1039.21 seconds
Validation Loss: 0.1653, Validation Accuracy: 94.30%
Full Epoch Time: 1074.79 seconds

Epoch 3/4
Processed batch 250/1875, Loss: 0.0105
Processed batch 500/1875, Loss: 0.0047
Processed batch 750/1875, Loss: 0.0079
Processed batch 1000/1875, Loss: 0.0037
Processed batch 1250/1875, Loss: 0.0273
Processed batch 1500/1875, Loss: 0.0186
Processed batch 1750/1875, Loss: 0.0774
Train Loss: 0.0589
Epoch Training Time: 1040.02 seconds
Validation Loss: 0.1788, Validation Accuracy: 94.20%
Full Epoch Time: 1075.59 seconds

Epoch 4/4
Processed batch 250/1875, Loss: 0.0176
Processed batch 500/1875, Loss: 0.0006
Processed batch 750/1875, Loss: 0.0062
Processed batch 1000/1875, Loss: 0.0101
Processed batch 1250/1875, Loss: 0.0044
Processed batch 1500/1875, Loss: 0.0190
Processed batch 1750/1875, Loss: 0.0028
Train Loss: 0.0353
Epoch Training Time: 1040.06 seconds
Validation Loss: 0.2013, Validation Accuracy: 94.62%
Full Epoch Time: 1075.99 seconds
Best validation accuracy of 94.62 achieved at epoch 4