In [None]:
import json
from datasets import Dataset
from transformers import (
    LlamaTokenizer, 
    LlamaForCausalLM, 
    TrainingArguments, 
    Trainer
)
batch_size = 1024 # Adjust this based on your memory capacity
# Load data from JSON file
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        items = json.load(f)
        for item in items:
            text = f"Instruction: {item['instruction']}\nResponse: {item['response']}"
            data.append(text)
    return data

# Prepare dataset
def prepare_dataset(data):
    return Dataset.from_dict({"text": data})

def tokenize_data(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

# Load model and tokenizer

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3",use_auth_token="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")
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3",use_auth_token="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")

# Load and prepare data
print("--> prepare the data")
data = load_data("data.json")
dataset = prepare_dataset(data)
tokenized_dataset = dataset.map(tokenize_data, batched=True, batch_size=batch_size, num_proc=4)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=200,     # log every 200 steps
)

# Initialize Trainer
print("-->  init training")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print("-->  start training")
# Start training
trainer.train()

# Save the model
print("-->  saving the model")
trainer.save_model("./finetuned_llama3larger")


In [None]:
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer
)
import torch
from tqdm import tqdm

batch_size = 1024  # Adjust this based on your memory capacity

# Load data from JSON file
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        items = json.load(f)
        for item in items:
            text = f"Instruction: {item['instruction']}\nResponse: {item['response']}"
            data.append(text)
    return data

# Prepare dataset
def prepare_dataset(data):
    return Dataset.from_dict({"text": data})

# Tokenize data
def tokenize_data(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3", use_auth_token="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")
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3", use_auth_token="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")

# Move model to the appropriate device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load and prepare data
print("--> Prepare the data")
data = load_data("data.json")
dataset = prepare_dataset(data)
tokenized_dataset = dataset.map(tokenize_data, batched=True, batch_size=batch_size, num_proc=4)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=200,     # log every 200 steps
)

# Initialize Trainer
print("--> Init training")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print("--> Start training")
# Start training
trainer.train()

# Save the model
print("--> Saving the model")
trainer.save_model("./llamaLarger")

print("--> Saved the model")

# Function to generate text with tqdm progress bar



In [None]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Function to load data from JSON file
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

# Function to prepare dataset
def prepare_dataset(data):
    texts = [f"Instruction: {item['instruction']}\nResponse: {item['response']}" for item in data]
    return Dataset.from_dict({"text": texts})

# Function to tokenize data
def tokenize_data(examples, tokenizer):
    model_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# Initialize tokenizer and model
def initialize_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Add padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="eager")
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

# Function to define the training arguments
def define_training_args(output_dir="./results"):
    return TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        save_steps=10_000,
        save_total_limit=2,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        use_mps_device=True,
    )

# Function to define the Trainer
def create_trainer(model, args, train_dataset, tokenizer):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

# Function to fine-tune and train the model
def train_model(trainer, output_dir):
    trainer.train()
    trainer.save_model(output_dir)

# Main function to orchestrate the training process
def main():
    # Set the device
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize model and tokenizer
    model_name = "ybelkada/falcon-7b-sharded-bf16"
    tokenizer, model = initialize_model_and_tokenizer(model_name)

    # Move model to the appropriate device
    model.to(device)

    # Load and prepare data
    data = load_data("data.json")
    dataset = prepare_dataset(data)
    tokenized_dataset = dataset.map(
        lambda x: tokenize_data(x, tokenizer),
        batched=True,
        num_proc=4,
        remove_columns=dataset.column_names
    )

    # Define training arguments
    training_args = define_training_args()

    # Create Trainer
    trainer = create_trainer(model, training_args, tokenized_dataset, tokenizer)

    # Train model
    output_dir = "./llamaLarger1"
    train_model(trainer, output_dir)

if __name__ == "__main__":
    main()

In [None]:
def generate_text(prompts, model, tokenizer, device, temperature=0.7, max_length=200):
    model.eval()
    generated_texts = []

    with torch.no_grad():
        for prompt in tqdm(prompts, desc="Generating texts"):
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            output = model.generate(**inputs, max_length=max_length, temperature=temperature)
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
            generated_texts.append(generated_text)
    
    return generated_texts

# Example usage of the generate_text function
prompts = ["create a complete pegasus workflow using python for machine learning"]
generated_texts = generate_text(prompts, model, tokenizer, device)

for text in generated_texts:
    print(text)

In [3]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

# Specify the paths to the saved model and tokenizer
model_path = "./finetuned_mistralLarger"
#tokenizer_path = "pthornton614/CodeLama-7b-Instruct"  # Same as the path used during fine-tuning
tokenizer_path = "mistralai/Mistral-7B-v0.3"  # Same as the path used during fine-tuning

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)

# Set the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded.")

# Load the model
print("Loading model...")
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = LlamaForCausalLM.from_pretrained(model_path).to(device)
print("Model loaded.")


Loading tokenizer...


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Tokenizer loaded.
Loading model...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Model loaded.


In [8]:
# Prepare a prompt
from tqdm import tqdm


prompt = "Instruction: Create a complete example of pegasus workflow using the python also with replicas and sites \nResponse:"

# Tokenize the prompt and provide attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=False,max_length=500)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Generate text
print("Generating text...")
# Generate text with diverse decoding strategies


print("Generating text...")
with torch.no_grad():
    for _ in tqdm(range(1), desc="Generating sequences"):
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=500,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            temperature = 0.3# Adjust the temperature value as needed

        )
# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)

Generating text...
Generating text...


Generating sequences: 100%|██████████| 1/1 [02:09<00:00, 129.51s/it]

Generated text:
Instruction: Create a complete example of pegasus workflow using the python also with replicas and sites 
Response: from Pegasus.api import *

# Define a workflow
wf = Workflow("replica-example")

# Add a job with a replica
job = Job("process")
job.add_inputs(File("raw_data.txt"))
job.add_args(File("raw_data.txt"))
job.add_args(File("processed_data.txt"))
job.add_args(File("processed_data.txt"))
job.add_args(File("aggregated_data.txt"))
job.add_args(File("aggregated_data.txt"))
job.add_args(File("results.txt"))
job.add_args(File("results.txt"))
job.add_args(File("summary.txt"))
job.add_args(File("summary.txt"))
job.add_args(File("final_results.txt"))
job.add_args(File("final_results.txt"))
job.add_args(File("final_summary.txt"))
job.add_args(File("final_summary.txt"))
job.add_args(File("final_results_with_planning.txt"))
job.add_args(File("final_results_with_planning.txt"))
job.add_args(File("final_summary_with_planning.txt"))
job.add_args(File("final_summary_with_plann




In [22]:
# Prepare a prompt
from tqdm import tqdm

slurm_script = """#!/bin/bash
#SBATCH --job-name=array_job
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err
#SBATCH --array=0-9

# Define and create a unique scratch directory
SCRATCH_DIRECTORY=/global/work/${USER}/job-array-example/${SLURM_JOBID}
mkdir -p ${SCRATCH_DIRECTORY}
cd ${SCRATCH_DIRECTORY}

cp ${SLURM_SUBMIT_DIR}/test.py ${SCRATCH_DIRECTORY}

# Each job will see a different ${SLURM_ARRAY_TASK_ID}
echo "now processing task id:: ${SLURM_ARRAY_TASK_ID}"
python test.py > output_${SLURM_ARRAY_TASK_ID}.txt

# After the job is done we copy our output back to ${SLURM_SUBMIT_DIR}
cp output_${SLURM_ARRAY_TASK_ID}.txt ${SLURM_SUBMIT_DIR}

# We step out of the scratch directory and remove it
cd ${SLURM_SUBMIT_DIR}
rm -rf ${SCRATCH_DIRECTORY}

# Happy end
exit 0
"""

prompt = f"Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. The workflow should replicate the functionality of the SLURM script as closely as possible, including file handling, job execution, and cleanup. {slurm_script}\nResponse:"

# Tokenize the prompt and provide attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True,max_length=1000)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Generate text
print("Generating text...")
# Generate text with diverse decoding strategies


print("Generating text...")
with torch.no_grad():
    for _ in tqdm(range(1), desc="Generating sequences"):
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=1000,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            top_k=50,
            do_sample=True
        )
# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)

Generating text...
Generating text...


Generating sequences: 100%|██████████| 1/1 [01:12<00:00, 72.09s/it]

Generated text:
Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. The workflow should replicate the functionality of the SLURM script as closely as possible, including file handling, job execution, and cleanup. #!/bin/bash
#SBATCH --job-name=array_job
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err
#SBATCH --array=0-9

# Define and create a unique scratch directory
SCRATCH_DIRECTORY=/global/work/${USER}/job-array-example/${SLURM_JOBID}
mkdir -p ${SCRATCH_DIRECTORY}
cd ${SCRATCH_DIRECTORY}

cp ${SLURM_SUBMIT_DIR}/test.py ${SCRATCH_DIRECTORY}

# Each job will see a different ${SLURM_ARRAY_TASK_ID}
echo "now processing task id:: ${SLURM_ARRAY_TASK_ID}"
python test.py > output_${SLURM_ARRAY_TASK_ID}.txt

# After the job is done we copy our output back to ${SLURM_SUBMIT_DIR}
cp output_${SLURM_ARRAY_TASK_ID}.txt ${SLURM_SUBMIT_DIR}

# We step out of the scratch directory and remove it
cd ${SLURM_SUBMIT_DIR}





In [26]:
prompt = f"""Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. 
Your response should be a complete Python script that includes:
1. Necessary imports from Pegasus.api
2. Workflow creation
3. Job definitions that replicate the SLURM script functionality
4. Transformation, Site, and Replica catalogs as needed
5. Writing the workflow to a file

SLURM script:
{slurm_script}

Begin your response with 'from Pegasus.api import *' and end it with 'workflow.write()'.
Do not repeat code unnecessarily. Ensure each part of the SLURM script functionality is addressed only once in the Pegasus workflow.

Response:
"""
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=1000,  # Increase max length for longer outputs
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,  # Enable sampling
    temperature=0.2,  # Adjust temperature (lower for more focused outputs)
    top_k=50,  # Limit to top k tokens
    top_p=0.95,  # Nucleus sampling
    no_repeat_ngram_size=3,  # Prevent repetition of 3-grams
    early_stopping=True,
    repetition_penalty=1.2  # Penalize repetition
)

In [27]:
#best  Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)

Generated text:
Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. The workflow should replicate the functionality of the SLURM script as closely as possible, including file handling, job execution, and cleanup. #!/bin/bash
#SBATCH --job-name=array_job
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err
#SBATCH --array=0-9

# Define and create a unique scratch directory
SCRATCH_DIRECTORY=/global/work/${USER}/job-array-example/${SLURM_JOBID}
mkdir -p ${SCRATCH_DIRECTORY}
cd ${SCRATCH_DIRECTORY}

cp ${SLURM_SUBMIT_DIR}/test.py ${SCRATCH_DIRECTORY}

# Each job will see a different ${SLURM_ARRAY_TASK_ID}
echo "now processing task id:: ${SLURM_ARRAY_TASK_ID}"
python test.py > output_${SLURM_ARRAY_TASK_ID}.txt

# After the job is done we copy our output back to ${SLURM_SUBMIT_DIR}
cp output_${SLURM_ARRAY_TASK_ID}.txt ${SLURM_SUBMIT_DIR}

# We step out of the scratch directory and remove it
cd ${SLURM_SUBMIT_DIR}


In [31]:
prompt = f"""Instruction: Can you write Pegasus workflow using the Pegasus.api Python package for Federated Learning. 
Your response should be a complete Python script that includes:
1. Necessary imports from Pegasus.api
2. Workflow creation
3. Job definitions that replicate the SLURM script functionality
4. Transformation, Site, and Replica catalogs as needed
5. Writing the workflow to a file

Begin your response with 'from Pegasus.api import *' and end it with 'workflow.write()'.
Do not repeat code unnecessarily. Ensure each part of the SLURM script functionality is addressed only once in the Pegasus workflow.

Response:
"""

# Tokenize the prompt and provide attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True,max_length=1000)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=1000,  # Increase max length for longer outputs
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,  # Enable sampling
    temperature=0.8,  # Adjust temperature (lower for more focused outputs)
    top_k=50,  # Limit to top k tokens
    top_p=0.95,  # Nucleus sampling
    no_repeat_ngram_size=3,  # Prevent repetition of 3-grams
    early_stopping=True,
    repetition_penalty=1.2  # Penalize repetition
)
# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)

Generated text:
Instruction: Can you write Pegasus workflow using the Pegasus.api Python package for Federated Learning. 
Your response should be a complete Python script that includes:
1. Necessary imports from Pegasus.api
2. Workflow creation
3. Job definitions that replicate the SLURM script functionality
4. Transformation, Site, and Replica catalogs as needed
5. Writing the workflow to a file

Begin your response with 'from Pegasus.api import *' and end it with 'workflow.write()'.
Do not repeat code unnecessarily. Ensure each part of the SLURM script functionality is addressed only once in the Pegasus workflow.

Response:
# --- Write the work flow to a DAG file ---
wf = WorkFlow("diamond-cluster", infer_dependencies=True)
job = (Job(submitdir='/home/ryan/Work', cmd=top_directory / "bin/pegasus-kickstart", args=args[0]) & check_inputs(File('in.txt'), File('intermediate.txt')) >> output_files[0] & input_files[-1]
job.add_args(*args, **kwargs)
logfile = LogFile(__name__)
job += add_mo

In [None]:
#repare a prompt
from tqdm import tqdm


prompt = "Instruction:how to submit the pegasus workflows\nResponse:"

# Tokenize the prompt and provide attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True,max_length=1000)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Generate text
print("Generating text...")
# Generate text with diverse decoding strategies


print("Generating text...")
with torch.no_grad():
    for _ in tqdm(range(1), desc="Generating sequences"):
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=1000,
            pad_token_id=tokenizer.eos_token_id,
            temperature = 0.3 # Adjust the temperature value as needed

        )
# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)

In [None]:
# Set the model to evaluation mode
model.eval()

# Define dummy input for the model (adjust dimensions as necessary)
dummy_input = torch.randint(0, 50256, (1, 1024))  # Replace with actual vocab size and input length

# Export the model to ONNX format
onnx_model_path = "model.onnx"
torch.onnx.export(model, dummy_input, onnx_model_path, 
                  input_names=['input_ids'], output_names=['logits'], 
                  dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}, 'logits': {0: 'batch_size', 1: 'sequence_length'}})


In [None]:
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch
from tqdm import tqdm
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
BATCH_SIZE = 8  # Adjust this based on your memory capacity
MAX_LENGTH = 1024
MODEL_NAME = "google/gemma-2-9b"
AUTH_TOKEN = "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"
OUTPUT_DIR = "./results"
MODEL_SAVE_PATH = "./gemmaLarger"

def load_data(file_path):
    """Load data from JSON file."""
    with open(file_path, 'r') as f:
        items = json.load(f)
    return [f"Instruction: {item['instruction']}\nResponse: {item['response']}" for item in items]

def prepare_dataset(data):
    """Prepare dataset from data."""
    return Dataset.from_dict({"text": data})

def tokenize_data(examples, tokenizer):
    """Tokenize data."""
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

def setup_model_and_tokenizer():
    """Set up model and tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=AUTH_TOKEN)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=AUTH_TOKEN, trust_remote_code=True)
    
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        logger.info("Using MPS (Metal Performance Shaders) device")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        logger.info("Using CUDA device")
    else:
        device = torch.device("cpu")
        logger.info("Using CPU")
    
    model.to("mps")
    # Allocate some dummy data to warm up the model on the device
    with torch.no_grad():
        dummy_input = torch.ones((1, 1), dtype=torch.int64).to("mps")
        _ = model(dummy_input)
    
    return model, tokenizer, device

class MPSTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _prepare_inputs(self, inputs):
        """
        Prepare inputs for the model, moving them to the correct device.
        """
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to("mps")
        return inputs
    
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Custom compute_loss function to ensure loss is computed on the correct device.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Move labels to the same device as logits
        labels = labels.to("mps")
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss = loss.cpu()
        return (loss, outputs) if return_outputs else loss

def main():
    model, tokenizer, device = setup_model_and_tokenizer()

    logger.info("Preparing the data")
    data = load_data("data.json")  # Ensure this file path is correct
    dataset = prepare_dataset(data)
    tokenized_dataset = dataset.map(
        lambda examples: tokenize_data(examples, tokenizer),
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=os.cpu_count(),
        remove_columns=["text"]
    )

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        gradient_accumulation_steps=4,
        eval_steps=1000,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        use_mps_device=True
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    logger.info("Initializing trainer")
    trainer = MPSTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    logger.info("Starting training")
    trainer.train()

    logger.info(f"Saving the model to {MODEL_SAVE_PATH}")
    trainer.save_model(MODEL_SAVE_PATH)
    logger.info("Model saved successfully")

if __name__ == "__main__":
    main()


In [None]:
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch
from tqdm import tqdm
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
BATCH_SIZE = 32  # Reduced batch size for CPU training
MAX_LENGTH = 1024
MODEL_NAME = "google/gemma-2-9b"
AUTH_TOKEN = "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"
OUTPUT_DIR = "./results"
MODEL_SAVE_PATH = "./gemmaLarger"

def load_data(file_path):
    """Load data from JSON file."""
    with open(file_path, 'r') as f:
        items = json.load(f)
    return [f"Instruction: {item['instruction']}\nResponse: {item['response']}" for item in items]

def prepare_dataset(data):
    """Prepare dataset from data."""
    return Dataset.from_dict({"text": data})

def tokenize_data(examples, tokenizer):
    """Tokenize data."""
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

def setup_model_and_tokenizer():
    """Set up model and tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=AUTH_TOKEN)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=AUTH_TOKEN)
    
    device = torch.device("cpu")
    logger.info("Forcing CPU usage")
    
    model.to(device)
    
    return model, tokenizer

def main():
    model, tokenizer = setup_model_and_tokenizer()

    logger.info("Preparing the data")
    data = load_data("data.json")
    dataset = prepare_dataset(data)
    tokenized_dataset = dataset.map(
        lambda examples: tokenize_data(examples, tokenizer),
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=1  # Set to 1 for CPU
    )

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=1,
        per_device_train_batch_size=2,  # Reduced for CPU
        save_steps=10_000,
        save_total_limit=2,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        fp16=False,  # Disable fp16
        bf16=False,  # Disable bf16
        gradient_accumulation_steps=8,  # Increased for CPU
        eval_steps=1000,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    logger.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    logger.info("Starting training")
    trainer.train()

    logger.info(f"Saving the model to {MODEL_SAVE_PATH}")
    trainer.save_model(MODEL_SAVE_PATH)
    logger.info("Model saved successfully")

if __name__ == "__main__":
    main()