In [1]:
# Step 1: Setup and Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Check device
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Step 2: Prepare Document
document = """Cheese is a type of dairy product produced in a range of flavors, textures, and forms by coagulation of the milk protein casein. It comprises proteins and fat from milk (usually the milk of cows, buffalo, goats or sheep). During production, milk is usually acidified and either the enzymes of rennet or bacterial enzymes with similar activity are added to cause the casein to coagulate. The solid curds are then separated from the liquid whey and pressed into finished cheese. Some cheeses have aromatic molds on the rind, the outer layer, or throughout."""

# Split document
split_idx = len(document) // 2
first_half = document[:split_idx]
second_half = document[split_idx:]

print("First half:", first_half)
print("\nSecond half:", second_half)

  Referenced from: <5AA8DD3D-A2CC-31CA-8060-88B4E9C18B09> /Users/vivekvajipey/miniconda3/envs/reasoning/lib/python3.10/site-packages/torchvision/image.so
  warn(


Using device: mps
First half: Cheese is a type of dairy product produced in a range of flavors, textures, and forms by coagulation of the milk protein casein. It comprises proteins and fat from milk (usually the milk of cows, buffalo, goats or sheep). During production, milk is usually acidified and either

Second half:  the enzymes of rennet or bacterial enzymes with similar activity are added to cause the casein to coagulate. The solid curds are then separated from the liquid whey and pressed into finished cheese. Some cheeses have aromatic molds on the rind, the outer layer, or throughout.


In [2]:
# Step 3: Initialize Model and Tokenizer
model_name = "meta-llama/Llama-2-7b-hf"

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Set model dtype based on device
dtype = torch.float16 if device.type == "cuda" else torch.float32

# Initialize model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
    device_map="auto"
)

print("Model and tokenizer initialized")

# Step 4: Configure and Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer initialized
trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [3]:
# Step 5: Prepare Training Data
# Tokenize second half (setting max_length to avoid truncation warning)
second_half_tokens = tokenizer(second_half, return_tensors="pt", max_length=512)
token_ids = second_half_tokens["input_ids"]

# Create single start token tensor
bos = torch.tensor([[tokenizer.bos_token_id]], device=device)

# Prepare input: [start_token, token1, ..., token(n-1)]
input_ids = torch.cat([
    bos,
    token_ids[:, :-1]
], dim=1).to(device)

# Labels: [token1, ..., token(n)]
labels = token_ids.to(device)

print("\nShapes:")
print("Input shape:", input_ids.shape)
print("Labels shape:", labels.shape)
print("\nTokens:")
print("First few input tokens:", tokenizer.convert_ids_to_tokens(input_ids[0])[:10])
print("First few label tokens:", tokenizer.convert_ids_to_tokens(labels[0])[:10])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


RuntimeError: torch.cat(): all input tensors must be on the same device. Received mps:0 and cpu

once again

Using device: mps
First half: Cheese is a type of dairy product produced in a range of flavors, textures, and forms by coagulation of the milk protein casein. It comprises proteins and fat from milk (usually the milk of cows, buffalo, goats or sheep). During production, milk is usually acidified and either

Second half:  the enzymes of rennet or bacterial enzymes with similar activity are added to cause the casein to coagulate. The solid curds are then separated from the liquid whey and pressed into finished cheese. Some cheeses have aromatic molds on the rind, the outer layer, or throughout.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Model and tokenizer initialized
trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


RuntimeError: torch.cat(): all input tensors must be on the same device. Received mps:0 and cpu