In [None]:
# Validate that the environment is ready: the core libraries import cleanly
# and a tiny dataset slice can be loaded.

import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

import transformers
print(f"Transformers: {transformers.__version__}")

# Imported only to prove scikit-learn is installed; the class is not used here.
from sklearn.ensemble import RandomForestRegressor
print("✅ Scikit-learn OK")

from datasets import load_dataset
# Load just the first 10 rows as a quick end-to-end check of the datasets stack.
dataset = load_dataset("flytech/python-codes-25k", split="train[:10]")
print("✅ Datasets OK")

print(" Environment Ready!!!!")


In [None]:
# Load model and initial training: Iteration 1 (fp16 base model + LoRA adapters)

import os
# Allocator options are set before torch is imported so they are in place
# before any CUDA memory is allocated in a fresh kernel.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()
import gc

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# Smaller model + dataset
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
dataset = load_dataset("flytech/python-codes-25k", split="train[:200]")  # Direct split

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in half precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
)

# Training mode. enable_input_require_grads() makes the embedding output
# require grad, which gradient checkpointing needs once the base weights are
# frozen by PEFT.
# NOTE: the base parameters are deliberately NOT flipped to requires_grad=True
# here; get_peft_model() below freezes every non-LoRA parameter again, so such
# a loop would have no effect on what is actually trained.
model.train()
model.enable_input_require_grads()

# Attach LoRA adapters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # Attention projections (q/k/v/o) and MLP projections (gate/up/down)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Verify that only the adapter weights are trainable

# Tokenization (sequences are left unpadded; padding happens at batch time)
def tokenize_function(examples):
    """Turn a batch of dataset rows into tokenized causal-LM training examples.

    Args:
        examples: Batched mapping with 'instruction', 'input' and 'output'
            columns (parallel lists), as passed by ``dataset.map(batched=True)``.

    Returns:
        The tokenizer output for the formatted prompts, truncated to 128
        tokens, with a 'labels' entry that is a copy of 'input_ids'.
    """
    prompt_template = "### Instruction: {}\n### Input: {}\n### Response: {}"
    rows = zip(examples['instruction'], examples['input'], examples['output'])
    prompts = [prompt_template.format(instruction, context, answer)
               for instruction, context, answer in rows]

    encoded = tokenizer(prompts, truncation=True, max_length=128)
    # Causal LM objective: the targets are the inputs themselves.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Low-memory TrainingArguments for a short smoke-test run
training_args = TrainingArguments(
    output_dir="./results_qwen",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,  # 32 sequences per optimizer step
    gradient_checkpointing=True,
    fp16=True,
    max_steps=50,  # Very small for testing
    logging_steps=5,
    dataloader_num_workers=0,
    # Use the string "none" to switch off logging integrations. Python None is
    # not the same thing: transformers 4.x treats it as "all", which enables
    # every installed integration (wandb, tensorboard, ...).
    report_to="none",
    optim="adamw_torch",
    learning_rate=2e-4
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=None  # Let the Trainer pick its default collator
)

print("Starting training...")
trainer.train()
print("Training completed!")


In [None]:
# Training Iteration 2: 4-bit quantized base model + LoRA adapters
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()
import gc

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model

# Same checkpoint as before, trained on the first 1000 rows of the dataset
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
dataset = load_dataset("flytech/python-codes-25k", split="train[:1000]")

# The tokenizer is created before the model; reuse EOS when no pad token exists
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NF4 4-bit weights with double quantization; matmuls are computed in fp16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

cuda_device = torch.cuda.current_device()

# The empty-string key maps the whole model onto the single current GPU.
# NOTE(review): offload_folder is still passed, but with every module pinned
# to one GPU it is presumably never used - confirm before relying on offload.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map={"": cuda_device},
    offload_folder="offload",
    trust_remote_code=True,
)

# Switch on non-reentrant gradient checkpointing before the model is wrapped
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# Freeze every base-model parameter
model.requires_grad_(False)

# LoRA adapters on the attention and MLP projection layers
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(model, lora_config)
# Make the embedding output require grad so checkpointed blocks can backpropagate
model.base_model.enable_input_require_grads()
model.print_trainable_parameters()

# Tokenization (max_length=256; padding is deferred to the data collator)
def tokenize_function(examples):
    """Format a batch of rows as instruction prompts and tokenize them.

    Args:
        examples: Batched mapping with 'instruction', 'input' and 'output'
            columns (parallel lists), as passed by ``dataset.map(batched=True)``.

    Returns:
        The tokenizer output (unpadded Python lists) for the formatted
        prompts, truncated to 256 tokens.

    No 'labels' column is stored. This cell trains with
    ``DataCollatorForLanguageModeling(mlm=False)``, which rebuilds the labels
    from the padded ``input_ids`` for every batch, so a stored copy would be
    overwritten anyway. A stored, unpadded labels column would also be ragged
    and could not be stacked into a tensor as soon as the batch size exceeds 1.
    """
    texts = [f"### Instruction: {inst}\n### Input: {inp}\n### Response: {out}"
             for inst, inp, out in zip(examples['instruction'], examples['input'], examples['output'])]
    return tokenizer(
        texts,
        truncation=True,
        padding=False,               # Let collator handle padding
        max_length=256,
        return_tensors=None
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Conservative TrainingArguments for a memory-constrained GPU
training_args = TrainingArguments(
    output_dir="./qwen-coder-finetuned",
    # Run length is controlled by max_steps alone. A positive max_steps
    # overrides num_train_epochs, so the former num_train_epochs=3 never took
    # effect and was removed to avoid suggesting a 3-epoch run.
    max_steps=2000,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,   # Reduced from 8
    optim="adamw_torch",
    learning_rate=1e-4,              # Reduced
    fp16=True,
    logging_steps=50,
    save_steps=550,
    eval_strategy="no",
    warmup_steps=50,                 # Reduced
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    # Keep the non-reentrant mode that was selected when checkpointing was
    # enabled on the model; otherwise the Trainer re-enables checkpointing
    # with its own default kwargs.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    remove_unused_columns=False,
    # The string "none" disables logging integrations; Python None does not
    # (transformers 4.x treats None as "all").
    report_to="none",
    dataloader_num_workers=0,
    group_by_length=True,
    max_grad_norm=0.3,               # Gradient clipping
    dataloader_drop_last=True        # Drop incomplete batches
)

# Clear any remaining memory
gc.collect()
torch.cuda.empty_cache()

# Causal-LM collator: pads each batch and derives labels from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("🚀 Starting training... Monitor: watch nvidia-smi -l 1")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

# Save final model
trainer.save_model("./my-python-code-llm-final_qwen")
print("✅ Training completed! Model saved.")

In [None]:
# Inference test: load the 4-bit base model and attach the fine-tuned LoRA adapter

import os
# Allocator options have to be set before the first CUDA call of the process,
# so this comes before any torch.cuda usage (only effective in a fresh kernel).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import gc
import torch
# prepare_model_for_kbit_training stays imported for training cells but is
# intentionally NOT called here: it is a training-time helper.
from peft import PeftModel, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Drop unreachable Python objects first so empty_cache() can actually return
# their CUDA blocks, then wait for pending GPU work to finish.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

# 4-bit NF4 quantization with fp16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # ✅ ~1.5GB vs 3GB (author's estimate)
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Quantized weights are placed on the GPU while loading. bitsandbytes 4-bit
# models must not be moved afterwards with .to("cuda") (older transformers
# versions raise on it), so the target device is given via device_map instead.
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map={"": torch.cuda.current_device()},
    low_cpu_mem_usage=True
)

# Attach the saved LoRA adapter (loaded frozen, which is what inference needs)
model = PeftModel.from_pretrained(base_model, "./my-python-code-llm-final_qwen")
model.eval()  # make sure dropout (including LoRA dropout) is disabled

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-1.5B-Instruct")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def generate_code(prompt, max_new_tokens=100, temperature=0.7, repetition_penalty=1.1):
    """Generate a completion for ``prompt`` with the fine-tuned model.

    The prompt is wrapped in the tokenizer's chat template (system + user
    message) and sampled from the notebook-level ``model``.

    Args:
        prompt: The user request, as plain text.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.

    Returns:
        The decoded completion only (prompt tokens removed, special tokens
        skipped, surrounding whitespace stripped).

    The defaults reproduce the previously hard-coded settings, so existing
    single-argument callers behave exactly as before.
    """
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    prompt_length = model_inputs.input_ids.shape[1]

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; keep only the continuation.
    completion_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Smoke test: run one prompt through the fine-tuned model and show the result
reverse_prompt = "Write a Python function to reverse a string leveraging the pandas library"
print(generate_code(reverse_prompt))

In [None]:
# Clear Nvidia cache, garbage, Free up GTX VRAM

import torch
import gc
# Collect garbage first: only then can empty_cache() hand the blocks of the
# just-freed tensors back to the driver.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
# mem_get_info() reports (free, total) device memory in bytes.
# memory_reserved() would report what PyTorch still holds, i.e. the opposite
# of "free".
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"VRAM Free: {free_bytes/1e9:.1f}GB of {total_bytes/1e9:.1f}GB")  # Should show >4GB free


In [None]:
# Inspect current GPU usage through the NVIDIA command-line tool
import os

smi_command = "nvidia-smi"
os.system(smi_command)


In [None]:
# Test 2: time a single generation call

import time

fib_prompt = "Write a Python function to calculate fibonacci sequence"
start = time.time()
fib_code = generate_code(fib_prompt)
print(fib_code)
elapsed_seconds = time.time() - start
print(f"Generated in {elapsed_seconds:.1f}s")


In [None]:
# Gradio front end for local network access

import gradio as gr

# Single text box in, generated code out; generate_code is called once per submit.
frontEnd = gr.Interface(
    fn=generate_code,
    inputs=gr.Textbox(label="Input", lines=3),
    outputs=gr.Textbox(label="Generated Code", lines=10, scale=1),
    title="Python SLM"
)
# server_name="0.0.0.0" binds to all interfaces, which is what makes the app
# reachable from other machines on the local network.
# share stays False: share=True would additionally publish the app through a
# public Gradio tunnel URL, exposing this unauthenticated endpoint to the
# internet. Re-enable it deliberately (ideally together with auth=...) if
# public access is really wanted.
frontEnd.launch(server_name="0.0.0.0", share=False)

In [None]:
# Training Iteration 3 - continue from the previously saved LoRA adapter
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()
import gc
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import PeftModel

# Base checkpoint and the adapter directory written by the previous iteration
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
previous_model_path = "./my-python-code-llm-final_qwen"

# Same dataset as the earlier iterations, now the first 5000 rows
print("Loading Python code dataset...")
dataset = load_dataset(
    "flytech/python-codes-25k",
    split="train[:5000]",
)

# Tokenizer for the base checkpoint; fall back to EOS if no pad token is defined
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization for python-codes-25k, reading the dataset's 'text' column
def tokenize_function(examples):
    """Tokenize the 'text' column of a batch of dataset rows.

    Args:
        examples: Batched mapping containing a 'text' column (list of
            strings), as passed by ``dataset.map(batched=True)``.

    Returns:
        The tokenizer output (unpadded Python lists), truncated to 512 tokens.

    No 'labels' column is stored. This cell trains with
    ``DataCollatorForLanguageModeling(mlm=False)``, which rebuilds the labels
    from the padded ``input_ids`` for every batch, so a stored copy would be
    overwritten anyway. A stored, unpadded labels column would also be ragged
    and could not be stacked into a tensor as soon as the batch size exceeds 1.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=False,  # padding is applied per batch by the collator
        max_length=512,
        return_tensors=None
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Same quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

# Load BASE model (quantized)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config,
    device_map="auto", offload_folder="offload",
    trust_remote_code=True, torch_dtype=torch.float16
)

# Load the PREVIOUS LoRA adapters on top.
# is_trainable=True is required to continue training: by default
# PeftModel.from_pretrained loads adapters frozen (inference mode), which
# would leave the Trainer with no parameter to optimize.
model = PeftModel.from_pretrained(base_model, previous_model_path, is_trainable=True)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
# As in Iteration 2: make the embedding output require grad so checkpointed
# blocks can backpropagate into the adapters although the base is frozen.
model.enable_input_require_grads()

model.print_trainable_parameters()  # must report a non-zero trainable count

# Conservative resume training args
training_args = TrainingArguments(
    output_dir="./qwen-coder-finetuned-v2",
    # Run length is controlled by max_steps alone. A positive max_steps
    # overrides num_train_epochs, so the former num_train_epochs=2 never took
    # effect and was removed.
    max_steps=3000,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, optim="adamw_torch",
    learning_rate=5e-5,
    fp16=True, logging_steps=25,
    save_steps=750, eval_strategy="no",
    warmup_steps=25, dataloader_pin_memory=False,
    gradient_checkpointing=True,
    # Keep the non-reentrant mode chosen above when the Trainer re-enables
    # gradient checkpointing.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    remove_unused_columns=False,
    # The string "none" disables logging integrations; Python None does not
    # (transformers 4.x treats None as "all").
    report_to="none", dataloader_num_workers=0,
    group_by_length=True, max_grad_norm=0.3,
    dataloader_drop_last=True, load_best_model_at_end=False
)

gc.collect()
torch.cuda.empty_cache()

# DataCollatorForLanguageModeling has no `padding` argument (passing one
# raises TypeError); it always pads each batch to its longest sequence.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("🚀 Resuming training from previous LoRA adapters...")
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer, data_collator=data_collator
)

trainer.train()
trainer.save_model("./my-python-code-llm-v2_final")
print("✅ Iteration 3 complete!")