In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fetch the TinyLlama chat checkpoint and keep a local copy under ./tinyllama.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Fits GTX 1660 Super
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `dtype` is the current spelling of the deprecated `torch_dtype` keyword.
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="auto")

# Persist the tokenizer next to the weights so the directory can be reloaded
# without going back to the Hub for the tokenizer files.
model.save_pretrained("./tinyllama")
tokenizer.save_pretrained("./tinyllama");  # trailing ';' hides the returned file list

2025-12-04 12:55:21.294230: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
`torch_dtype` is deprecated! Use `dtype` instead!


In [2]:
from datasets import load_dataset

# Pull the complete instruction/response corpus from the Hub (all splits).
dataset = load_dataset(path="flytech/python-codes-25k")

In [3]:
# Print the DatasetDict summary: available splits, column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})


In [1]:
import os
# Set before torch is imported so the CUDA caching allocator sees the option.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# Smaller model + dataset
model_name = "microsoft/DialoGPT-small"
dataset = load_dataset("flytech/python-codes-25k", split="train[:200]")  # Direct split

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # The tokenizer may ship without a pad token; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in half precision.
# - `dtype` replaces the deprecated `torch_dtype` keyword.
# - `trust_remote_code` is intentionally NOT set: it allows a Hub repository to
#   run arbitrary Python while loading, and this checkpoint loads with the
#   architecture built into transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Training mode, plus input embeddings that require grad so gradient
# checkpointing still has a differentiable input once the base weights are frozen.
model.train()
model.enable_input_require_grads()
# NOTE: no manual `requires_grad = True` loop here. get_peft_model() freezes
# every base weight and leaves only the adapter matrices trainable, so such a
# loop has no effect on what gets trained (see print_trainable_parameters below).

# Attach LoRA adapters to the attention projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Verify trainable params

# Fixed tokenization (dynamic padding)
def tokenize_function(examples):
    """Turn a batch of dataset rows into tokenized causal-LM training examples.

    Args:
        examples: Batched mapping with parallel 'instruction', 'input' and
            'output' lists, as passed by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the rendered prompts (truncated to 128
        tokens, not padded), with an added "labels" entry that is a copy of
        "input_ids".
    """
    prompts = []
    rows = zip(examples['instruction'], examples['input'], examples['output'])
    for instruction, context, answer in rows:
        prompts.append(
            f"### Instruction: {instruction}\n### Input: {context}\n### Response: {answer}"
        )

    encoded = tokenizer(prompts, truncation=True, max_length=128)
    # Causal LM objective: the targets are the inputs themselves.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize the whole subset and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Ultra-safe TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    fp16=True,
    max_steps=50,  # Very small for testing
    logging_steps=5,
    dataloader_num_workers=0,
    # The string "none" is what disables logging integrations; passing the
    # Python value None falls back to the library default instead.
    report_to="none",
    optim="adamw_torch",
    learning_rate=2e-4
)

# Create trainer
# `processing_class` is the current name of the deprecated `tokenizer` argument.
# NOTE(review): the examples are unpadded and carry their own "labels"; this
# relies on per_device_train_batch_size=1 so no label padding is needed. Confirm
# before raising the batch size.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=None  # Let trainer handle
)

print("Starting training...")
trainer.train()
print("Training completed!")


2025-12-04 13:15:09.764873: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
`torch_dtype` is deprecated! Use `dtype` instead!


trainable params: 405,504 || all params: 124,845,312 || trainable%: 0.3248




Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,20.5007
10,20.1909
15,19.6972
20,19.172
25,18.3136
30,16.8491
35,15.722
40,14.5437
45,13.8242
50,13.493


Training completed!


In [None]:
# Training - Iteration2

import os
# Set before torch is imported so the CUDA caching allocator sees the option.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

# Production model + larger dataset
model_name = "microsoft/DialoGPT-small"
dataset = load_dataset("flytech/python-codes-25k", split="train[:5000]")

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # The tokenizer may ship without a pad token; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in half precision.
# - `dtype` replaces the deprecated `torch_dtype` keyword.
# - `trust_remote_code` is intentionally NOT set: it allows a Hub repository to
#   run arbitrary Python while loading, and this checkpoint loads with the
#   architecture built into transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Training mode, plus input embeddings that require grad so gradient
# checkpointing still has a differentiable input once the base weights are frozen.
model.train()
model.enable_input_require_grads()
# NOTE: no manual `requires_grad = True` loop here. get_peft_model() freezes
# every base weight and leaves only the adapter matrices trainable, so such a
# loop has no effect on what gets trained (see print_trainable_parameters below).

# Production LoRA config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# tokenization with padding=True
def tokenize_function(examples):
    """Tokenize a batch of rows into padded causal-LM training examples.

    Each row is rendered as an "### Instruction / ### Input / ### Response"
    prompt, truncated to 128 tokens and padded to the longest prompt in the
    batch handed to this call.

    Args:
        examples: Batched mapping with parallel 'instruction', 'input' and
            'output' lists, as passed by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output plus a "labels" entry. Labels mirror "input_ids",
        except that positions the attention mask marks as padding are set to
        -100 so the loss skips them. Without this, padded positions would carry
        the pad-token id (which is the EOS id in this notebook) as a target.
    """
    texts = [f"### Instruction: {inst}\n### Input: {inp}\n### Response: {out}"
             for inst, inp, out in zip(examples['instruction'], examples['input'], examples['output'])]
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors=None
    )

    if "attention_mask" in tokenized:
        # Use the mask rather than comparing against pad_token_id: pad == eos
        # here, and a genuine EOS inside the text must stay a valid target.
        tokenized["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
    else:
        # No mask available: fall back to a plain copy of the inputs.
        tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

# Tokenize the whole subset and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# PRODUCTION TrainingArguments
training_args = TrainingArguments(
    output_dir="./python-code-llm",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    fp16=True,
    # Run length is controlled by max_steps alone. A num_train_epochs value set
    # alongside it is ignored, so it is not specified here. With 5000 rows and
    # an effective batch of 2 * 16 = 32, one epoch is ~157 optimizer steps, so
    # 2000 steps is roughly 12-13 passes over the data.
    max_steps=2000,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    dataloader_num_workers=0,
    # The string "none" is what disables logging integrations; passing the
    # Python value None falls back to the library default instead.
    report_to="none",
    optim="adamw_torch",
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    lr_scheduler_type="cosine"
)

# Use proper data collator (mlm=False selects the causal-LM objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create trainer
# `processing_class` is the current name of the deprecated `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator
)

print("Starting PRODUCTION training...")
print("Monitor with: watch nvidia-smi -l 1")
trainer.train()

# Save final model
trainer.save_model("./my-python-code-llm-final")
print("✅ Training completed! Model saved to ./my-python-code-llm-final")


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475




Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting PRODUCTION training...
Monitor with: watch nvidia-smi -l 1


Step,Training Loss
50,19.4224
100,17.5023
150,9.1245
200,6.0264
250,4.9156
300,4.4112
350,4.1094
400,3.8945
450,3.7659
500,3.6488
