In [None]:
"""
Mistral-7B Language Model Fine-Tuning Script

This script fine-tunes the Mistral-7B language model on a custom dataset. It uses the Hugging Face 
Transformers library to load the pre-trained model and tokenizer, prepare the dataset, and perform 
the fine-tuning process.

Key features:
1. Loads and prepares data from a JSON file
2. Uses the Mistral-7B model and tokenizer
3. Tokenizes the dataset for training
4. Configures training arguments
5. Performs fine-tuning using the Hugging Face Trainer
6. Saves the fine-tuned model

The script is designed to work with instruction-response pairs stored in a JSON file. It formats 
the data, tokenizes it, and then fine-tunes the model on this dataset.

Note: This script requires access to the Mistral-7B model, which may need authentication. 
Make sure you have the necessary permissions and have set up your Hugging Face token correctly.

Usage:
Ensure you have a 'data.json' file in the same directory as this script, then run:
python script_name.py

Requirements:
- transformers
- datasets
- torch (PyTorch)
- Sufficient GPU memory to load and fine-tune the Mistral-7B model
"""

import json
from datasets import Dataset
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer
)

batch_size = 1024 # Adjust this based on your memory capacity

# Load data from JSON file
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        items = json.load(f)
        for item in items:
            text = f"Instruction: {item['instruction']}\nResponse: {item['response']}"
            data.append(text)
    return data

# Prepare dataset
def prepare_dataset(data):
    return Dataset.from_dict({"text": data})

def tokenize_data(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

# Load model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3", use_auth_token="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")
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3", use_auth_token="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")

# Load and prepare data
print("--> prepare the data")
data = load_data("data.json")
dataset = prepare_dataset(data)
tokenized_dataset = dataset.map(tokenize_data, batched=True, batch_size=batch_size, num_proc=4)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs', # directory for storing logs
    logging_steps=200, # log every 200 steps
)

# Initialize Trainer
print("--> init training")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print("--> start training")
# Start training
trainer.train()

# Save the model
print("--> saving the model")
trainer.save_model("./mistralai7B")

In [None]:
"""
Falcon-7B Language Model Fine-Tuning Script

This script fine-tunes the Falcon-7B language model on a custom dataset. It uses the Hugging Face 
Transformers library to load the pre-trained model and tokenizer, prepare the dataset, and perform 
the fine-tuning process.

Key features:
1. Loads and prepares data from a JSON file
2. Uses the Falcon-7B model and tokenizer
3. Tokenizes the dataset for training
4. Configures training arguments
5. Performs fine-tuning using the Hugging Face Trainer
6. Saves the fine-tuned model
7. Supports CPU and MPS devices

The script is designed to work with instruction-response pairs stored in a JSON file. It formats 
the data, tokenizes it, and then fine-tunes the model on this dataset.

Usage:
Ensure you have a 'data.json' file in the same directory as this script, then run:
python script_name.py

Requirements:
- transformers
- datasets
- torch (PyTorch)
- Sufficient GPU memory to load and fine-tune the Falcon-7B model
"""

import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Function to load data from JSON file
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

# Function to prepare dataset
def prepare_dataset(data):
    texts = [f"Instruction: {item['instruction']}\nResponse: {item['response']}" for item in data]
    return Dataset.from_dict({"text": texts})

# Function to tokenize data
def tokenize_data(examples, tokenizer):
    model_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# Initialize tokenizer and model
def initialize_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Add padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="eager")
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

# Function to define the training arguments
def define_training_args(output_dir="./results"):
    return TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        save_steps=10_000,
        save_total_limit=2,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        use_mps_device=True,
    )

# Function to define the Trainer
def create_trainer(model, args, train_dataset, tokenizer):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

# Function to fine-tune and train the model
def train_model(trainer, output_dir):
    trainer.train()
    trainer.save_model(output_dir)

# Main function to orchestrate the training process
def main():
    # Set the device
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize model and tokenizer
    model_name = "ybelkada/falcon-7b-sharded-bf16"
    tokenizer, model = initialize_model_and_tokenizer(model_name)

    # Move model to the appropriate device
    model.to(device)

    # Load and prepare data
    data = load_data("data.json")
    dataset = prepare_dataset(data)
    tokenized_dataset = dataset.map(
        lambda x: tokenize_data(x, tokenizer),
        batched=True,
        num_proc=4,
        remove_columns=dataset.column_names
    )

    # Define training arguments
    training_args = define_training_args()

    # Create Trainer
    trainer = create_trainer(model, training_args, tokenized_dataset, tokenizer)

    # Train model
    output_dir = "./falcon7B"
    train_model(trainer, output_dir)

if __name__ == "__main__":
    main()

In [None]:
def generate_text(prompts, model, tokenizer, device, temperature=0.7, max_length=200):
    # Set the model to evaluation mode
    model.eval()
    # Initialize an empty list to store the generated texts
    generated_texts = []

    # Disable gradient calculations for inference
    with torch.no_grad():
        # Iterate over each prompt in the input prompts list
        for prompt in tqdm(prompts, desc="Generating texts"):
            # Tokenize the prompt and move the input tensors to the specified device
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            # Generate text using the model with the specified max_length and temperature
            output = model.generate(**inputs, max_length=max_length, temperature=temperature)
            # Decode the generated tokens to a string, skipping special tokens
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
            # Append the generated text to the list of generated texts
            generated_texts.append(generated_text)
    
    # Return the list of generated texts
    return generated_texts

# Example usage of the generate_text function
prompts = ["create a complete pegasus workflow using python for machine learning"]
# Call the function with the prompts, model, tokenizer, and device
generated_texts = generate_text(prompts, model, tokenizer, device)

# Print each generated text
for text in generated_texts:
    print(text)


In [None]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

# Specify the paths to the saved model and tokenizer
model_path = "./mistralai7B"
# tokenizer_path = "pthornton614/CodeLama-7b-Instruct"  # Alternate tokenizer path, commented out
tokenizer_path = "mistralai/Mistral-7B-v0.3"  # Tokenizer path used during fine-tuning

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)

# Set the padding token if it is not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded.")

# Load the model
print("Loading model...")
# Check if the Apple Silicon (MPS) backend is available, otherwise use CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the model and move it to the specified device
model = LlamaForCausalLM.from_pretrained(model_path).to(device)
print("Model loaded.")


In [22]:
import torch
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM

# Specify the paths to the saved model and tokenizer
model_path = "./mistralai7B"
tokenizer_path = "mistralai/Mistral-7B-v0.3"  # Same as the path used during fine-tuning

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)

# Set the padding token if it is not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded.")

# Load the model
print("Loading model...")
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = LlamaForCausalLM.from_pretrained(model_path).to(device)
print("Model loaded.")

# Prepare a prompt
slurm_script = """#!/bin/bash
#SBATCH --job-name=array_job
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err
#SBATCH --array=0-9

# Define and create a unique scratch directory
SCRATCH_DIRECTORY=/global/work/${USER}/job-array-example/${SLURM_JOBID}
mkdir -p ${SCRATCH_DIRECTORY}
cd ${SCRATCH_DIRECTORY}

cp ${SLURM_SUBMIT_DIR}/test.py ${SCRATCH_DIRECTORY}

# Each job will see a different ${SLURM_ARRAY_TASK_ID}
echo "now processing task id:: ${SLURM_ARRAY_TASK_ID}"
python test.py > output_${SLURM_ARRAY_TASK_ID}.txt

# After the job is done we copy our output back to ${SLURM_SUBMIT_DIR}
cp output_${SLURM_ARRAY_TASK_ID}.txt ${SLURM_SUBMIT_DIR}

# We step out of the scratch directory and remove it
cd ${SLURM_SUBMIT_DIR}
rm -rf ${SCRATCH_DIRECTORY}

# Happy end
exit 0
"""

prompt = f"Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. The workflow should replicate the functionality of the SLURM script as closely as possible, including file handling, job execution, and cleanup. {slurm_script}\nResponse:"

# Tokenize the prompt and provide attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1000)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Generate text
print("Generating text...")
with torch.no_grad():
    for _ in tqdm(range(1), desc="Generating sequences"):
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=1000,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            top_k=50,
            do_sample=True
        )

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)


Generating text...
Generating text...


Generating sequences: 100%|██████████| 1/1 [01:12<00:00, 72.09s/it]

Generated text:
Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. The workflow should replicate the functionality of the SLURM script as closely as possible, including file handling, job execution, and cleanup. #!/bin/bash
#SBATCH --job-name=array_job
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err
#SBATCH --array=0-9

# Define and create a unique scratch directory
SCRATCH_DIRECTORY=/global/work/${USER}/job-array-example/${SLURM_JOBID}
mkdir -p ${SCRATCH_DIRECTORY}
cd ${SCRATCH_DIRECTORY}

cp ${SLURM_SUBMIT_DIR}/test.py ${SCRATCH_DIRECTORY}

# Each job will see a different ${SLURM_ARRAY_TASK_ID}
echo "now processing task id:: ${SLURM_ARRAY_TASK_ID}"
python test.py > output_${SLURM_ARRAY_TASK_ID}.txt

# After the job is done we copy our output back to ${SLURM_SUBMIT_DIR}
cp output_${SLURM_ARRAY_TASK_ID}.txt ${SLURM_SUBMIT_DIR}

# We step out of the scratch directory and remove it
cd ${SLURM_SUBMIT_DIR}





In [26]:
prompt = f"""Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. 
Your response should be a complete Python script that includes:
1. Necessary imports from Pegasus.api
2. Workflow creation
3. Job definitions that replicate the SLURM script functionality
4. Transformation, Site, and Replica catalogs as needed
5. Writing the workflow to a file

SLURM script:
{slurm_script}

Begin your response with 'from Pegasus.api import *' and end it with 'workflow.write()'.
Do not repeat code unnecessarily. Ensure each part of the SLURM script functionality is addressed only once in the Pegasus workflow.

Response:
"""
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=1000,  # Increase max length for longer outputs
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,  # Enable sampling
    temperature=0.2,  # Adjust temperature (lower for more focused outputs)
    top_k=50,  # Limit to top k tokens
    top_p=0.95,  # Nucleus sampling
    no_repeat_ngram_size=3,  # Prevent repetition of 3-grams
    early_stopping=True,
    repetition_penalty=1.2  # Penalize repetition
)

In [27]:
#best output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)

Generated text:
Instruction: Convert the following SLURM script into a Pegasus workflow using the Pegasus.api Python package. The workflow should replicate the functionality of the SLURM script as closely as possible, including file handling, job execution, and cleanup. #!/bin/bash
#SBATCH --job-name=array_job
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err
#SBATCH --array=0-9

# Define and create a unique scratch directory
SCRATCH_DIRECTORY=/global/work/${USER}/job-array-example/${SLURM_JOBID}
mkdir -p ${SCRATCH_DIRECTORY}
cd ${SCRATCH_DIRECTORY}

cp ${SLURM_SUBMIT_DIR}/test.py ${SCRATCH_DIRECTORY}

# Each job will see a different ${SLURM_ARRAY_TASK_ID}
echo "now processing task id:: ${SLURM_ARRAY_TASK_ID}"
python test.py > output_${SLURM_ARRAY_TASK_ID}.txt

# After the job is done we copy our output back to ${SLURM_SUBMIT_DIR}
cp output_${SLURM_ARRAY_TASK_ID}.txt ${SLURM_SUBMIT_DIR}

# We step out of the scratch directory and remove it
cd ${SLURM_SUBMIT_DIR}
