In [2]:
############################
# GPU and CPU Check Code
# KEEP AT THE TOP
############################

# !pip install psutil
# !pip install gputil

import psutil
import torch
import os
import spacy

from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


# Ask psutil for the core counts: physical cores first, then logical cores
num_cpus = psutil.cpu_count(logical=False)
num_logical_cpus = psutil.cpu_count(logical=True)

print(
    f"Number of physical CPUs: {num_cpus}",
    f"Number of logical CPUs: {num_logical_cpus}",
    sep="\n",
)

# GPUtil is optional: when it cannot be imported, report that and move on
try:
    import GPUtil

    # One entry per GPU that GPUtil reports
    gpus = GPUtil.getGPUs()
    num_gpus = len(gpus)

    print(f"Number of GPUs: {num_gpus}")

    # Number the GPUs from 1 in the printed report
    for gpu_number, gpu in enumerate(gpus, start=1):
        print(
            f"GPU {gpu_number}: {gpu.name}",
            f"\tMemory Total: {gpu.memoryTotal} MB",
            f"\tMemory Used: {gpu.memoryUsed} MB",
            f"\tMemory Free: {gpu.memoryFree} MB",
            f"\tGPU Utilization: {gpu.load * 100}%",
            f"\tGPU Temperature: {gpu.temperature} °C",
            sep="\n",
        )
except ImportError:
    print("GPUtil library not found. Cannot check GPU information.")


Number of physical CPUs: 128
Number of logical CPUs: 128
Number of GPUs: 2
GPU 1: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 31.0 °C
GPU 2: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 27.0 °C


In [4]:
# USE ONLY TO EXTRACT FILES FROM TAR FILES

import tarfile

def extract_all_files(tar_file_path, extract_to):
    """Extract every member of a tar archive into ``extract_to``.

    The archive is opened with mode ``'r'``, so compression is detected
    automatically. Members whose paths would land outside ``extract_to``
    (for example ``../x`` or absolute paths) are refused instead of being
    written.

    Args:
        tar_file_path: Path to the tar archive to read.
        extract_to: Directory the archive contents are extracted into.

    Raises:
        tarfile.TarError: If a member would be extracted outside
            ``extract_to`` (a ``tarfile.FilterError`` subclass when the
            built-in extraction filters are available).
    """
    import os  # local import: the enclosing cell only imports tarfile

    with tarfile.open(tar_file_path, 'r') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Interpreters with extraction filters: let the stdlib 'data'
            # filter reject or sanitise unsafe members.
            tar.extractall(extract_to, filter='data')
            return

        # Older interpreters: validate every member path before writing
        # anything, so a bad archive leaves the destination untouched.
        destination = os.path.realpath(extract_to)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(destination, member.name))
            if os.path.commonpath([destination, target]) != destination:
                raise tarfile.TarError(
                    f"Refusing to extract {member.name!r}: "
                    f"path escapes {extract_to!r}"
                )
        tar.extractall(extract_to)

# Example usage: unpack the Yahoo Answers archive into the datasets/ directory
extract_to = 'datasets/'
tar_file_path = 'datasets/yahoo_answers_csv.tar.gz'
extract_all_files(tar_file_path=tar_file_path, extract_to=extract_to)

In [2]:
import torch
from transformers import BertTokenizer
import spacy
from tqdm import tqdm
import os

# Function to read texts from files within a folder
def read_texts_from_folder(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Entries are visited in sorted filename order so the returned list is the
    same on every run and every filesystem. Sub-directories and other
    non-file entries are skipped rather than opened.

    Args:
        folder_path: Directory whose files should be read.

    Returns:
        A list with one string per file: the file's content with leading and
        trailing whitespace stripped. Empty if the folder has no files.
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # open() would raise on a directory, so skip anything that is
            # not a regular file.
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read().strip())
    return texts

# Function to preprocess texts
def preprocess(texts, tokenizer, max_length=512):
    """Encode each text with ``tokenizer`` and collect the resulting input ids.

    Every text is passed to ``tokenizer.encode_plus`` with truncation enabled
    and padding set to ``'max_length'``; progress is shown through ``tqdm``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` whose result has an
            ``'input_ids'`` entry.
        max_length: Value forwarded as ``max_length`` to ``encode_plus``.

    Returns:
        A list holding the ``'input_ids'`` of each text, in input order.
    """
    # Same encoding options for every text
    encode_options = {
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
    }
    return [
        tokenizer.encode_plus(document, **encode_options)['input_ids']
        for document in tqdm(texts)
    ]

# Positive and negative training folders inside the aclImdb directory
aclImdb_folder = "datasets/aclImdb"
train_pos_path, train_neg_path = (
    os.path.join(aclImdb_folder, 'train', label) for label in ('pos', 'neg')
)

# Load the BERT-base-uncased tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read both folders; positive texts come first in the combined list
train_pos_texts = read_texts_from_folder(train_pos_path)
train_neg_texts = read_texts_from_folder(train_neg_path)
train_texts = [*train_pos_texts, *train_neg_texts]

# Encode every text into input ids
processed = preprocess(train_texts, tokenizer=tokenizer)

100%|██████████| 25000/25000 [01:23<00:00, 299.23it/s]


In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import os

print("EDITING FILE")

# Decode each encoded text back to a string and write one text per line;
# the file is the input for LineByLineTextDataset below.
pretrain_file_path = "pretraining_text.txt"
with open(pretrain_file_path, 'w', encoding='utf-8') as pretrain_file:
    for text_ids in processed:
        text = tokenizer.decode(text_ids, skip_special_tokens=True)
        pretrain_file.write(text + '\n')

print("DONE EDITING")

# Create a dataset for pre-training
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=pretrain_file_path,
    block_size=512  # Adjust the block size as per your sequence length
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Probability of masking tokens
)

# Initialize the BERT masked language model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# Training arguments
# - No evaluation strategy is set: the Trainer below receives no
#   eval_dataset, so step-based evaluation could not run.
# - Warmup is a fraction of the run instead of a fixed 10000 steps: one epoch
#   over the 25,000 texts at batch size 32 is at most about 782 optimizer
#   steps, so a 10000-step warmup would never let the learning rate reach
#   `learning_rate`.
training_args = TrainingArguments(
    output_dir="./pretrained_bert_imdb",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    warmup_ratio=0.1
)

# Create Trainer instance for pre-training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

print("STARTED TRAINING")

# Start pre-training
trainer.train()

# Persist the final weights explicitly: with save_steps=500 a short run may
# finish without ever writing a checkpoint. The tokenizer is saved alongside
# because it is not passed to the Trainer.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

print("TRAINING DONE")

EDITING FILE
DONE EDITING


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


STARTED TRAINING


