<a href="https://colab.research.google.com/github/ubiodee/Plutus_Demo/blob/main/Cardano_Plutus1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets accelerate sentencepiece huggingface_hub bitsandbytes

import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Authenticate with the Hugging Face Hub without hard-coding a secret in the
# notebook source: the token is read from the HF_TOKEN environment variable.
# An unset or empty variable is passed as None so that `login` asks for the
# token interactively instead of trying to log in with an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

# Load the training split from the local JSON file
dataset = load_dataset("json", data_files={"train": "/content/Plutus_Ubio_Cleansed.json"})

# Base checkpoint to fine-tune
# NOTE(review): confirm this repo id exists on the Hub and that the account
# used above has been granted access to it before running the cell.
model_name = "meta-llama/Llama-3.2-7B"

# Load model with bfloat16 (better stability than float16)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Automatically assigns model to available GPU
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure tokenizer has a padding token; a new token enlarges the vocabulary,
# so the embedding matrix has to be resized to match
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer so the saved checkpoint
# and later generation calls use the same padding id
model.config.pad_token_id = tokenizer.pad_token_id

# Format dataset for instruction tuning
def format_data(example):
    """Build the instruction-tuning prompt for one dataset record.

    Reads the ``"Problem"`` and ``"Solution"`` fields of ``example`` and
    stores the assembled prompt under ``example["text"]``.

    Args:
        example: Mapping for a single record. It is modified in place.

    Returns:
        The same ``example`` mapping with the ``"text"`` entry added.
    """
    # A field can be absent or present with a null value; in both cases fall
    # back to the placeholder so the prompt never contains the text "None".
    problem = example.get("Problem")
    if problem is None:
        problem = "No problem provided"
    solution = example.get("Solution")
    if solution is None:
        solution = "No solution provided"

    example["text"] = (
        "### Instruction:\n"
        "You are an AI Plutus assistant. Help developers write and debug smart contracts.\n"
        "Provide well-structured, correct, and complete Plutus solutions.\n\n"
        f"### Problem:\n{problem}\n\n"
        f"### Solution:\n{solution}"
    )
    return example

# Build the prompt text for every record of the training split
dataset = dataset["train"].map(function=format_data)

# Tokenization function
def tokenize_function(example):
    """Tokenize a batch of prompts and attach causal-LM labels.

    Args:
        example: Batch whose ``"text"`` entry holds the prompt strings to
            tokenize.

    Returns:
        The tokenizer output, padded or truncated to 512 tokens and returned
        as PyTorch tensors, with an extra ``"labels"`` tensor: a copy of
        ``input_ids`` in which padded positions are set to -100.
    """
    tokenized = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    labels = tokenized["input_ids"].clone()
    # Ignore padding in the loss. Padding is located through the attention
    # mask instead of by comparing ids with pad_token_id, so genuine tokens
    # are kept even if the pad token shares its id with another token.
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels
    return tokenized

# Turn the prompt text into model inputs batch by batch, drop the raw "text"
# column afterwards, and have the dataset hand back PyTorch tensors
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)
tokenized_datasets.set_format(type="torch")

# Training arguments
# No evaluation strategy is passed: evaluation is off by default, and leaving
# the keyword out avoids depending on its name, which differs between
# transformers releases (`evaluation_strategy` vs. `eval_strategy`).
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,  # Adjust batch size based on Colab GPU
    gradient_accumulation_steps=4,  # Accumulate gradients to handle small batch sizes
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",  # Disable Weights & Biases logging
    push_to_hub=True,
    hub_model_id="ubiodee/Cardano_plutus",
    weight_decay=0.01,
    max_grad_norm=1.0,
    bf16=True,  # Enables stable mixed precision training
    gradient_checkpointing=True,  # Reduces memory usage
)

# Wire the model, the training arguments and the tokenized data into a Trainer
trainer = Trainer(train_dataset=tokenized_datasets, args=training_args, model=model)

# Run fine-tuning
trainer.train()

# Save tokenizer locally
tokenizer.save_pretrained("./tokenizer")

# Push tokenizer and model to Hugging Face. The tokenizer goes to the repo id
# configured in the training arguments, so the id is defined in one place and
# both uploads are guaranteed to target the same repository.
tokenizer.push_to_hub(training_args.hub_model_id)
trainer.push_to_hub()




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,1.0607
20,0.7664
30,0.6126
40,0.4488
50,0.4815
60,0.4041
70,0.4543
80,0.4162
90,0.2375
100,0.2157


README.md:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ubiodee/Cardano_plutus/commit/2b4a4b44cf3ab019571e170dc2499e081d2acac1', commit_message='End of training', commit_description='', oid='2b4a4b44cf3ab019571e170dc2499e081d2acac1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ubiodee/Cardano_plutus', endpoint='https://huggingface.co', repo_type='model', repo_id='ubiodee/Cardano_plutus'), pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline
# Prompt to try against the fine-tuned model
prompt = "Write a PlutusTx script that always succeeds and passes validation."

# Text-generation pipeline that loads the model from the "./results"
# directory and reuses the tokenizer object already in memory
pipe = pipeline(task="text-generation", model="./results", tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
