In [1]:
import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

import transformers
print(f"Transformers: {transformers.__version__}")

from sklearn.ensemble import RandomForestRegressor
print("✅ Scikit-learn OK")

from datasets import load_dataset
dataset = load_dataset("flytech/python-codes-25k", split="train[:10]")
print("✅ Datasets OK")

print (" Environment Ready!!!!")


PyTorch: 2.5.1+cu121
CUDA: True
Transformers: 4.57.3
✅ Scikit-learn OK
✅ Datasets OK
 Environment Ready!!!!


In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()
import gc

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# Smaller model + dataset
#model_name = "microsoft/DialoGPT-small"
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
dataset = load_dataset("flytech/python-codes-25k", split="train[:200]")  # Direct split

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model CRITICALLY with gradients enabled
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
)

# CRITICAL: Enable training mode + gradients
model.train()
model.enable_input_require_grads()
for param in model.parameters():
    param.requires_grad = True

# Apply LoRA AFTER enabling gradients
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # DeepSeek-V2 specific (MoE + MLA layers)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Verify trainable params

# Fixed tokenization (dynamic padding)
def tokenize_function(examples):
    texts = [f"### Instruction: {inst}\n### Input: {inp}\n### Response: {out}" 
             for inst, inp, out in zip(examples['instruction'], examples['input'], examples['output'])]
    tokenized = tokenizer(texts, truncation=True, max_length=128)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Ultra-safe TrainingArguments
training_args = TrainingArguments(
    output_dir="./results_qwen",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    fp16=True,
    max_steps=50,  # Very small for testing
    logging_steps=5,
    dataloader_num_workers=0,
    report_to=None,
    optim="adamw_torch",
    learning_rate=2e-4
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=None  # Let trainer handle
)

print("Starting training...")
trainer.train()
print("Training completed!")


2025-12-05 09:47:46.823576: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,1.3113
10,0.7731
15,0.6068
20,0.5492
25,0.5005
30,0.4226
35,0.3702
40,0.3377
45,0.2959
50,0.3045


Training completed!


In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
import torch
torch.cuda.empty_cache()
import gc

from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType

# Production model + smaller dataset for testing
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
dataset = load_dataset("flytech/python-codes-25k", split="train[:1000]")  # Reduced for testing

# FIXED: Load tokenizer FIRST
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# FIXED: 4-bit quantization + device_map + offload
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # CRITICAL: 4-bit quantization
    device_map="auto",               # CRITICAL: layer-wise device placement
    offload_folder="offload",        # CRITICAL: CPU offload for excess layers
    trust_remote_code=True,
    torch_dtype=torch.float16
)

# FIXED: Enable gradient checkpointing BEFORE LoRA
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# FIXED: Apply LoRA (only ~1% params trainable)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                             # Reduced from 16
    lora_alpha=16,                   # Reduced
    lora_dropout=0.05,               # Reduced
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# FIXED: Tokenization (shorter max_length)
def tokenize_function(examples):
    texts = [f"### Instruction: {inst}\n### Input: {inp}\n### Response: {out}" 
             for inst, inp, out in zip(examples['instruction'], examples['input'], examples['output'])]
    tokenized = tokenizer(
        texts, 
        truncation=True, 
        padding=False,               # Let collator handle padding
        max_length=256,              # Increased slightly but safe
        return_tensors=None
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Ultra-conservative TrainingArguments
training_args = TrainingArguments(
    output_dir="./qwen-coder-finetuned",
    num_train_epochs=1,              # Reduced for testing
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,   # Reduced from 8
    optim="adamw_torch",
    learning_rate=1e-4,              # Reduced
    fp16=True,
    logging_steps=5,                 # More frequent logging
    save_steps=250,
    eval_strategy="no",
    warmup_steps=50,                 # Reduced
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    report_to=None,
    dataloader_num_workers=0,
    group_by_length=True,
    max_grad_norm=0.3,               # Gradient clipping
    dataloader_drop_last=True        # Drop incomplete batches
)

# Clear any remaining memory
gc.collect()
torch.cuda.empty_cache()

# Use proper data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [None]:
# Inference test

# Load your trained model
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-1.5B-Instruct", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base_model, "./my-python-code-llm-final_qwen")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-1.5B-Instruct")

def generate_code(prompt):
    inputs = tokenizer(f"### Instruction: {prompt}\n### Response:", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test it!
print(generate_code("Write a Python function to reverse a string leveraging the pandas library"))