<a href="https://colab.research.google.com/github/ubiodee/llama_Finetuning/blob/main/Model_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import gc
from flask import Flask, render_template_string, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from peft import PeftModel, PeftConfig
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# Configure the MPS allocator's high-watermark ratio before any MPS allocation.
# NOTE(review): per the PyTorch MPS documentation a ratio of 0.0 removes the
# allocator's upper limit rather than imposing one — confirm this is intended.
os.environ.update({'PYTORCH_MPS_HIGH_WATERMARK_RATIO': '0.0'})

# Create the Flask application object for this module
app = Flask(import_name=__name__)

# Local directory that holds the fine-tuned model checkpoint
model_dir = "./checkpoint-195"  # Path to your local model directory

# Discover nested "checkpoint-<step>" sub-directories inside model_dir.
# The scan is defensive: a missing model_dir yields an empty list instead of
# raising FileNotFoundError here, and entries whose suffix is not a plain
# integer (e.g. "checkpoint-best") are skipped instead of crashing int().
checkpoint_dirs = []
if os.path.isdir(model_dir):
    checkpoint_dirs = [
        d for d in os.listdir(model_dir)
        if os.path.isdir(os.path.join(model_dir, d))
        and d.startswith("checkpoint-")
        and d.split('-')[1].isdecimal()
    ]

# Highest-numbered nested checkpoint, or None when there is none.
# NOTE: this value is informational only; the model and tokenizer in this
# notebook are loaded from model_dir itself.
latest_checkpoint_dir = max(checkpoint_dirs, key=lambda x: int(x.split('-')[1])) if checkpoint_dirs else None

# Define tokenizer path
tokenizer_path = model_dir  # Use the same directory for the tokenizer

# Choose the compute backend: Apple's MPS when PyTorch reports it, else CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Report the choice to the notebook output
if device == "mps":
    print("Apple Silicon detected. Using MPS.")
else:
    print("No GPU detected. Using CPU.")

def clear_memory():
    """Release unused Python objects and cached accelerator memory.

    Garbage collection runs first so that tensors which are only kept alive
    by reference cycles are destroyed before the allocator caches are
    emptied; otherwise their memory would still be held when the caches
    are flushed. Each backend cache is only touched when that backend is
    available on this machine.

    Returns:
        None
    """
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Load the causal language model weights from the local checkpoint directory
print("Loading model...")
try:
    # Free whatever can be freed before the large allocation
    clear_memory()

    # Half precision when running on MPS, full precision otherwise
    if device == "mps":
        load_dtype = torch.float16
    else:
        load_dtype = torch.float32

    # Instantiate the model from the files in model_dir
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        torch_dtype=load_dtype,
        low_cpu_mem_usage=True,
    )

    print(f"Model loaded successfully. Device: {device}")
    clear_memory()

except Exception as load_error:
    # Surface the failure in the notebook output, then propagate it unchanged
    print(f"Error loading model: {load_error}")
    raise

# Tokenizer setup: read it from the same local directory as the model
try:
    print("Loading tokenizer...")
    tokenizer_options = {"trust_remote_code": True, "use_fast": False}
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_options)
    # Reuse the end-of-sequence token as the padding token
    tokenizer.pad_token = tokenizer.eos_token
except Exception as tokenizer_error:
    # Report the problem, then re-raise so the cell fails visibly
    print(f"Error loading tokenizer: {tokenizer_error}")
    raise

# Create the text-generation pipeline around the already-loaded model.
# `model` is an instantiated object here, so load-time options such as
# `device_map` / `torch_dtype` do not apply to it; the pipeline is instead
# told explicitly which device to run on via `device`, so the backend chosen
# during device detection is actually used for inference.
try:
    print("Creating text generation pipeline...")
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        # Pass max_new_tokens here (e.g. max_new_tokens=512) to control the
        # generation length for every call made through this pipeline.
        device=device,
    )

except Exception as e:
    # Report the problem, then re-raise so the cell fails visibly
    print(f"Error creating pipeline: {e}")
    raise

No GPU detected. Using CPU.
Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Run one prompt through the pipeline created in the setup cell. The pipeline
# is reused rather than rebuilt, so `pipe` keeps a single definition and its
# configuration; the generation length is passed per call instead.
prompt = "Write a PlutusTx script to burn a token."
result = pipe(prompt, max_new_tokens=500)
# The pipeline returns a list with one dict per generated sequence
print(result[0]['generated_text'])