In [3]:
import re
def sanitizeText(stringText):
  if isinstance(stringText, str): return re.sub(r'[^A-Za-z0-9\s]+', '', stringText).strip()
  else: return ''

In [4]:
from json import load
with open('bookSummaries.json','r',encoding='utf-16') as file: bookSummaries = load(file)
with open('bookReviews.json','r',encoding='utf-16') as file: reviewTexts2 = load(file)
import pandas as pd
bookContent = pd.DataFrame(bookSummaries,columns=['bookTitle','bookContent'])
bookContent['bookTitle'] = bookContent['bookTitle'].apply(sanitizeText)
bookContent = bookContent[bookContent['bookContent'].apply(lambda x: len(str(x).split())>10)].reset_index(drop=True)
bookContent.drop_duplicates().reset_index(drop=True,inplace=True)
bookContent['bookContent'] = bookContent['bookContent'].apply(lambda x: x.replace('Back to store\n',''))
bookReviews = pd.DataFrame(reviewTexts2,columns=['bookTitle','reviews'])
bookReviews['bookTitle'] = bookReviews['bookTitle'].apply(sanitizeText)
merged_books_df = pd.merge(bookReviews, bookContent, on='bookTitle', how='inner')
display(merged_books_df.head())

Unnamed: 0,bookTitle,reviews,bookContent
0,Toxic Turkey,Toxic Turkey by Gage Irving is a captivating m...,Toxic Turkey\n“Infinite wealth waited for them...
1,Love All the Way,Love All the Way is labeled as a romance novel...,Love All the Way\nCopyright © 2021 by Aurora C...
2,Love All the Way,Melanie Foster and Allen Shandi met on Septemb...,Love All the Way\nCopyright © 2021 by Aurora C...
3,Love All the Way,"Melanie, a fiery young woman, meets Allen when...",Love All the Way\nCopyright © 2021 by Aurora C...
4,Love All the Way,Love All the Way by Aurora Carafe is a heartwa...,Love All the Way\nCopyright © 2021 by Aurora C...


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "EleutherAI/gpt-neo-125M"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_max_length = model.config.max_position_embeddings
print(f"Model: {model_name}")
print(f"Model max length: {model_max_length}")

Model: EleutherAI/gpt-neo-125M
Model max length: 2048


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
modelName = "sshleifer/distilbart-cnn-12-6"  # Smaller BART model for summarization
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(modelName)
summarizer_tokenizer = AutoTokenizer.from_pretrained(modelName)

# Model's max length
summarizer_model_max_length = summarizer_model.config.max_position_embeddings
print(f"Model: {modelName}")
print(f"Model max length: {summarizer_model_max_length}")

Model: sshleifer/distilbart-cnn-12-6
Model max length: 1024


In [6]:
#merged_df = pd.DataFrame(processed_reviews_manual,columns=['bookTitle','bookContent','reviews'])#.to_csv('bookReview_summarized.csv',index=False)
merged_df = pd.read_csv('bookReview_summarized.csv')

In [8]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling # Keep import for later use if needed

def combine_text(row):
    return f"Book Title: {row['bookTitle']}\nBook Content: {row['bookContent']}\nReview: {row['reviews']}"

merged_df['combined_text'] = merged_df.apply(combine_text, axis=1)

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(merged_df)

# Set a padding token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the combined text with truncation but without padding here
def tokenize_function(examples):
    # Use the tokenizer loaded earlier
    # Set truncation to True to ensure sequences are not longer than model_max_length
    # Remove padding="max_length" from this step
    return tokenizer(examples["combined_text"], truncation=True, max_length=model_max_length)

# Remove '__index_level_0__' from the remove_columns list (if it exists after using merged_df)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['bookTitle', 'reviews', 'bookContent', 'combined_text'])

# Data blocking: Concatenate and split into fixed-size chunks
# This ensures all input sequences have the same length (model_max_length)
def group_texts(examples):
    # Concatenate all texts from the batch
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last incomplete chunk if it exists
    total_length = (total_length // model_max_length) * model_max_length
    # Split by chunks of max_len
    result = {
        k: [t[i : i + model_max_length] for i in range(0, total_length, model_max_length)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy() # For causal language modeling, labels are the same as input_ids
    return result

# Apply the data blocking function to the tokenized dataset
lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    batch_size=1000, # Process in larger batches for efficiency
    num_proc=4, # Use multiple processes for faster mapping
)


# Print some information about the processed dataset
print(lm_dataset)
print(lm_dataset[0])

# Split the blocked dataset into training and validation sets
train_test_split = lm_dataset.train_test_split(test_size=0.2)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(eval_dataset))

Map:   0%|          | 0/349 [00:00<?, ? examples/s]

Map: 100%|██████████| 349/349 [00:00<00:00, 446.27 examples/s]
Map (num_proc=4): 100%|██████████| 349/349 [00:00<00:00, 1068.66 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 90
})
{'input_ids': [10482, 11851, 25, 29274, 7137, 198, 10482, 14041, 25, 29274, 7137, 198, 447, 250, 18943, 9504, 5129, 13488, 329, 606, 379, 262, 5461, 1627, 198, 259, 257, 3234, 326, 40424, 606, 656, 4692, 12, 50132, 5123, 13, 447, 251, 198, 38, 496, 26101, 198, 51, 18047, 7137, 198, 15269, 10673, 3738, 36743, 5857, 11739, 198, 1797, 15766, 25, 41417, 12, 16, 12, 5607, 486, 4310, 12, 2920, 12, 16, 198, 20344, 3890, 25, 44211, 4897, 5834, 198, 27245, 7412, 25, 37356, 198, 2949, 636, 286, 428, 9207, 743, 307, 31759, 393, 18307, 287, 597, 1296, 393, 416, 597, 1724, 11, 13028, 11, 7914, 11, 49857, 11081, 11, 8296, 11, 393, 416, 597, 1321, 6143, 45069, 1080, 960, 16341, 329, 39146, 973, 329, 3199, 2423, 960, 19419, 262, 3194, 7170, 286, 262, 6434, 13, 198, 14772, 6669, 1653, 23499, 198, 53, 3529, 8511, 11, 4744, 198, 464, 38315, 2304, 385, 2254, 198, 75, 1689, 1653, 12984, 20020, 31, 14816, 13, 785, 198, 35

In [None]:
from transformers import TrainingArguments
import torch
# You might need to install and import torch_xla for actual TPU detection and usage
try:
    import torch_xla.core.xla_model as xm
    _has_tpu = True
except ImportError:
    _has_tpu = False

device = xm.xla_device() if _has_tpu else ("cuda" if torch.cuda.is_available() else "cpu")

# Set batch sizes based on device
# Assuming 'cuda' for GPU, 'cpu' for CPU, and a hypothetical 'tpu'
# You would need to adjust this logic based on how you detect your TPU
# current_device = "cuda" if torch.cuda.is_available() else "cpu"
# if _has_tpu: current_device = "tpu"

if device == "cuda":
    train_batch_size = 8  # Reduced batch size for GPU
    eval_batch_size = 8 # Reduced batch size for GPU
elif _has_tpu: # Check device type for XLA (TPU)
    # TPU batch size is typically per core, and you have multiple cores (e.g., 8 for v2-8)
    # You would set a per-core batch size and the Trainer handles distribution
    # Let's use a larger per-core batch size for TPU example
    train_batch_size = 7 # Example per-core batch size for TPU
    eval_batch_size = 7 # Example per-core batch size for TPU
else: # Default to CPU
    train_batch_size = 1  # Smaller batch size for CPU
    eval_batch_size = 1 # Smaller batch size for CPU

'''
# Define training arguments
training_args = TrainingArguments(
output_dir="./output/gpt-neo-fine-tuned",  # Directory to save checkpoints and logs
overwrite_output_dir=True,
num_train_epochs=3,  # Number of training epochs
per_device_train_batch_size=train_batch_size,  # Batch size for training based on device
per_device_eval_batch_size=eval_batch_size,  # Batch size for evaluation based on device
learning_rate=5e-5,  # Learning rate
weight_decay=0.01,
eval_strategy="epoch",  # Evaluate at the end of each epoch
save_strategy="epoch", # Save checkpoint at the end of each epoch
load_best_model_at_end=True, # Load the best model based on evaluation metric at the end
metric_for_best_model="eval_loss", # Metric to monitor for best model
logging_dir="./logs", # Directory for storing logs
logging_steps=10, # Log training progress every 10 steps
report_to="none" # Disable reporting to external services like Weights & Biases
)


training_args = TrainingArguments(
output_dir="./output/gpt-neo-fine-tuned",  # Directory to save checkpoints and logs
overwrite_output_dir=True,
num_train_epochs=3,  # Number of training epochs
per_device_train_batch_size=train_batch_size,  # Batch size for training based on device
per_device_eval_batch_size=eval_batch_size,  # Batch size for evaluation based on device
learning_rate=5e-5,  # Learning rate
weight_decay=0.01,
eval_strategy="epoch",  # Evaluate at the end of each epoch
save_strategy="epoch", # Save checkpoint at the end of each epoch
load_best_model_at_end=True, # Load the best model based on evaluation metric at the end
metric_for_best_model="eval_loss", # Metric to monitor for best model
logging_dir="./logs", # Directory for storing logs
logging_steps=10, # Log training progress every 10 steps
report_to="none", # Disable reporting to external services like Weights & Biases
fp16=False, # Ensure fp16 is not used by default
bf16=False # Explicitly disable bf16
)
print(training_args)
'''
training_args = TrainingArguments(
output_dir="./output/gpt-neo-fine-tuned",  # Directory to save checkpoints and logs
overwrite_output_dir=True,
num_train_epochs=3,  # Number of training epochs
per_device_train_batch_size=train_batch_size,  # Batch size for training based on device
per_device_eval_batch_size=eval_batch_size,  # Batch size for evaluation based on device
learning_rate=5e-5,  # Learning rate
weight_decay=0.01,
eval_strategy="epoch",  # Evaluate at the end of each epoch
save_strategy="epoch", # Save checkpoint at the end of each epoch
load_best_model_at_end=True, # Load the best model based on evaluation metric at the end
metric_for_best_model="eval_loss", # Metric to monitor for best model
logging_dir="./logs", # Directory for storing logs
logging_steps=10, # Log training progress every 10 steps
report_to="none", # Disable reporting to external services like Weights & Biases
fp16=False, # Ensure fp16 is not used by default
bf16=False # Explicitly disable bf16
)

  if _has_tpu: device = xm.xla_device()


In [8]:
import torch
try:
    import torch_xla.core.xla_model as xm
    _has_tpu = True
except ImportError:
    _has_tpu = False

# Set the device to TPU if available
device = xm.xla_device() if _has_tpu else ("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the correct device
model.to(device)

# Ensure the model is in `float32` (default for TPU)
model = model.to(torch.float32)  # Set to float32 explicitly


  device = xm.xla_device() if _has_tpu else ("cuda" if torch.cuda.is_available() else "cpu")


In [9]:
from accelerate import Accelerator
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch_xla.core.xla_model as xm

# Initialize Accelerator with no mixed precision
accelerator = Accelerator(mixed_precision="no")  # Explicitly disable mixed precision

# Set device to TPU
device = 'cpu'

# Move the model to TPU and ensure float32 precision
model = model.to(device).to(torch.float32)
'''
# Define TrainingArguments with no mixed precision enabled
training_args = TrainingArguments(
    output_dir="./output/gpt-neo-fine-tuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    fp16=False,  # Disable FP16 precision
    bf16=False,  # Disable BF16 precision
    report_to="none",  # Disable external reporting
)
'''
# Create Trainer with no mixed precision and manually disable any fused operations
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your training dataset
    eval_dataset=eval_dataset,  # Your evaluation dataset
)

# Prepare the trainer with accelerator (no mixed precision)
trainer = accelerator.prepare(trainer)

# Start training
trainer.train()



RuntimeError: `fused=True` requires all the params to be floating point Tensors of supported devices: ['mps', 'cuda', 'xpu', 'hpu', 'cpu', 'privateuseone'] but torch.float32 and xla

In [None]:
from transformers import Trainer, TrainingArguments
import torch
try:
    import torch_xla.core.xla_model as xm
    _has_tpu = True
except ImportError:
    _has_tpu = False

# Set the device to TPU if available
device = xm.xla_device() if _has_tpu else ("cuda" if torch.cuda.is_available() else "cpu")

# Adjust training batch size based on device
if device == "cuda":
    train_batch_size = 8
    eval_batch_size = 8
elif device.type == "xla":  # TPU-specific handling
    train_batch_size = 7  # Adjust per-core batch size
    eval_batch_size = 7
else:
    train_batch_size = 1
    eval_batch_size = 1

# Ensure no mixed precision unless needed, avoid `fused=True`
training_args = TrainingArguments(
    output_dir="./output/gpt-neo-fine-tuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    fp16=False,  # Disable FP16 mixed precision
    bf16=False,  # Disable BF16 mixed precision
)

# Initialize the Trainer with the TPU device
trainer = Trainer(
    model=model.to(device),  # Ensure model is moved to the correct device (TPU or CUDA)
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
print(f"Training on device: {trainer.args.device}")
trainer.train()

# Save the model after training
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


  device = xm.xla_device() if _has_tpu else ("cuda" if torch.cuda.is_available() else "cpu")


Training on device: xla:0




RuntimeError: `fused=True` requires all the params to be floating point Tensors of supported devices: ['mps', 'cuda', 'xpu', 'hpu', 'cpu', 'privateuseone'] but torch.float32 and xla

In [None]:
from transformers import Trainer, TrainingArguments

# Explicitly move the model to the determined device before initializing the Trainer
model.to(device)

# Initialize the Trainer
trainer = Trainer(
    model=model, # Use the model loaded earlier
    args=training_args, # Use the training arguments defined earlier
    train_dataset=train_dataset, # Use the prepared training dataset
    eval_dataset=eval_dataset, # Use the prepared evaluation dataset
    # Data collator is usually needed for dynamic padding, but since we did data blocking
    # to fixed length, it might not be strictly necessary, but it's good practice.
    # We can add DataCollatorForLanguageModeling if needed, but let's try without first.
    # data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

# Start training
print(f"Training on device: {trainer.args.device}")
trainer.train()

print("Training finished.")

# You can save the fine-tuned model if needed
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [12]:
from transformers import Trainer, DataCollatorForLanguageModeling # Import DataCollatorForLanguageModeling
from accelerate import Accelerator

if device == 'cuda':
    print(f"Detected device: {device}. Attempting to use Unsloth.")
    # Import Unsloth components
    try:
        import unsloth
        from unsloth import FastLanguageModel
        if 'FastTrainer' in dir(unsloth): 
            from unsloth import FastTrainer
            Trainer_class = FastTrainer
        elif 'UnslothTrainer' in dir(unsloth):
            from unsloth import UnslothTrainer
            Trainer_class = UnslothTrainer
        else: Trainer_class = Trainer
        # Wrap the model with Unsloth
        # You might need to adjust max_seq_length based on your tokenized data if not using max_length=model_max_length
        try: 
            model = unsloth.FastLanguageModel.from_pretrained(
model_name = model_name, # Use the original model name
max_seq_length = model_max_length,
dtype = None, # None for auto detection
load_in_4bit = True, # Load in 4bit for memory efficiency
)
            print("Model wrapped with Unsloth.")
        except ModuleNotFoundError: pass
    
    except ImportError:
        print("Unsloth not installed. Proceeding without Unsloth.")
        # If Unsloth is not installed, use the standard Trainer
        Trainer_class = Trainer
        print("Using standard Trainer.")

    except Exception as e:
        print(f"Error during Unsloth setup: {e}. Proceeding with standard Trainer.")
        # If there's an error with Unsloth setup, use the standard Trainer
        Trainer_class = Trainer
        print("Using standard Trainer.")
    
else:
    print(f"Detected device: {device}. Proceeding without Unsloth.")
    model.to('cpu')

    # Use the standard Trainer on CPU
    Trainer_class = Trainer
    print("Using standard Trainer.")

# Define the data collator for causal language modeling
# Ensure the data collator is also on the correct device if necessary,
# but DataCollatorForLanguageModeling typically handles this during batch preparation
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# Initialize the Trainer
# Explicitly pass device to TrainingArguments if available, although Trainer usually handles this
# For debugging, setting device="cpu" here might be necessary depending on the Trainer version and how it uses the device
trainer = Trainer_class( # Use the determined Trainer class
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    data_collator=data_collator, # Include the data collator
    # tokenizer=tokenizer, # Optional, can be left commented out
    # compute_metrics=compute_metrics, # Optional, can be left commented out
)
if device != 'cuda': trainer = accelerator.prepare(trainer)
# Start fine-tuning
print("Starting training...")
trainer.train()
print("Training finished.")

# You can save the fine-tuned model if needed
trainer.save_model("./gpt-neo-fine-tuned")

Detected device: cpu. Proceeding without Unsloth.


F0000 00:00:1756816466.327917   14642 pjrt_computation_client.cpp:525] Non-OK-status: status
Status: INTERNAL: Error preparing computation: Out of memory allocating 36233592832 bytes.
*** Begin stack trace ***
	tsl::CurrentStackTrace[abi:cxx11]()
	torch_xla::runtime::PjRtComputationClient::TransferFromDevice(absl::lts_20230802::Span<std::shared_ptr<torch_xla::runtime::ComputationClient::Data> const>)
	torch_xla::ReleaseGilAndTransferData(absl::lts_20230802::Span<std::shared_ptr<torch::lazy::BackendData> const>)
	torch_xla::XlaDataToTensors(absl::lts_20230802::Span<std::shared_ptr<torch::lazy::BackendData> const>, absl::lts_20230802::Span<c10::ScalarType const>)
	torch_xla::XLATensor::ToTensor(bool)
	torch_xla::XLANativeFunctions::_to_copy(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)
	
	at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::opti

: 