In [None]:
# pip installs

!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece wandb matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import math
import os
import re
from datetime import datetime

import matplotlib.pyplot as plt
import torch
import transformers
import wandb
from datasets import load_dataset, Dataset
from google.colab import userdata, files
from huggingface_hub import login
from peft import LoraConfig, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

In [None]:
# Constants
# Identity of the run: base checkpoint to adapt, W&B / Hub project name and Hub account.
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "code-translator"
HF_USER = "wu7115" # your HF username

# Token budget per example. Prompts are kept only when their token count plus
# KEEP_BUFFER stays below MAX_SEQUENCE_LENGTH (see filter_long), and the same
# limit is handed to the trainer as max_seq_length.
MAX_SEQUENCE_LENGTH = 512 # Longer because code snippets can be long
KEEP_BUFFER = 10

# Timestamped names: evaluated once when this cell runs, so re-running the cell
# starts a new run name / output directory / Hub repo id.
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Hyperparameters
# LoRA adapter shape and the attention projections it is attached to.
LORA_R = 32
LORA_ALPHA = 64
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.1
# True selects the 4-bit (nf4) quantization config, False the 8-bit one.
QUANT_4_BIT = True

# Optimisation schedule passed to SFTConfig.
EPOCHS = 1
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

# STEPS is used as both the evaluation interval and the logging interval;
# SAVE_STEPS is the checkpoint interval.
STEPS = 50
SAVE_STEPS = 5000
# Gates wandb.init / wandb.finish and the trainer's report_to setting.
LOG_TO_WANDB = True

%matplotlib inline

In [None]:
# Authenticate against the Hugging Face Hub with the token kept in Colab secrets,
# and store it as a git credential as well.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Log in to Weights & Biases using the API key kept in Colab secrets
wandb_api_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = PROJECT_NAME
# Upload checkpoints as artifacts only when W&B logging is switched on.
# "false" turns model upload off; the previous fallback ("end") still uploaded
# the final model even though LOG_TO_WANDB was False.
os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_TO_WANDB else "false"
os.environ["WANDB_WATCH"] = "gradients"

[34m[1mwandb[0m: Currently logged in as: [33mwu7115[0m ([33mwu7115-uci[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# --- Dataset Processing ---

# Paths
# Relative paths, so the four files must sit in the notebook's working directory.
# Each split is a pair of parallel files: load_code_pair reads them line by line
# and pairs line i of the .cpp file with line i of the .py file.
train_cpp_path = 'train-C++-Python-tok.cpp'
train_py_path = 'train-C++-Python-tok.py'
val_cpp_path = 'val-C++-Python-tok.cpp'
val_py_path = 'val-C++-Python-tok.py'

# Upload manually (left panel → Upload Files) or programmatically
# uploaded = files.upload()

def clean_python_code(py_code, indent_unit='    '):
    """Convert tokenised Python with INDENT/DEDENT markers into indented source.

    The markers carry the block structure of the program. Simply deleting them
    and stripping every line (the previous behaviour) flattens the code into
    something that is no longer valid Python, so the nesting depth is tracked
    here and each line is re-indented accordingly.

    Args:
        py_code: Tokenised Python text whose line breaks are already real
            newline characters.
        indent_unit: String emitted once per nesting level. Pass '' to get
            flattened, unindented output.

    Returns:
        The cleaned code: markers removed, each line stripped and re-indented,
        and whitespace-only lines collapsed away.
    """
    # Word boundaries keep identifiers such as INDENTED or MY_DEDENT_COUNT intact,
    # which a plain substring replace would mangle.
    marker = re.compile(r'\b(?:INDENT|DEDENT)\b')

    depth = 0
    rendered = []
    for raw_line in py_code.split('\n'):
        # Markers precede the statement they apply to, so update the depth
        # before rendering the line they appear on.
        for token in marker.findall(raw_line):
            if token == 'INDENT':
                depth += 1
            elif depth > 0:
                # Never go below zero on unbalanced input.
                depth -= 1
        body = marker.sub('', raw_line).strip()
        rendered.append(indent_unit * depth + body if body else '')

    # Collapse runs of empty lines into a single line break.
    return re.sub(r'\n\s*\n', '\n', '\n'.join(rendered))

# def load_code_pair(cpp_path, py_path):
#     with open(cpp_path, 'r') as f_cpp, open(py_path, 'r') as f_py:
#         cpp_lines = f_cpp.readlines()
#         py_lines = f_py.readlines()

#     dataset = []
#     for cpp_code, py_code in zip(cpp_lines, py_lines):
#         cpp_code = cpp_code.replace('NEW_LINE', '\n').replace('STRNEWLINE', '\n')
#         py_code = py_code.replace('NEW_LINE', '\n').replace('STRNEWLINE', '\n')

#         prompt = f"Translate the following C++ code into Python:\n\n{cpp_code}\n\nAnswer: {py_code}"
#         dataset.append({"text": prompt})

#     return Dataset.from_list(dataset)

def load_code_pair(cpp_path, py_path):
    """Build a prompt dataset from two parallel, line-aligned source files.

    Line i of ``cpp_path`` and line i of ``py_path`` form one training pair.
    The NEW_LINE / STRNEWLINE placeholders are turned back into real line
    breaks, the Python side is cleaned with ``clean_python_code``, and both
    are embedded in a single prompt string.

    Args:
        cpp_path: Path to the tokenised C++ file (one sample per line).
        py_path: Path to the tokenised Python file (one sample per line).

    Returns:
        A ``Dataset`` with a single ``"text"`` column holding the prompts.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(cpp_path, 'r', encoding='utf-8') as f_cpp, open(py_path, 'r', encoding='utf-8') as f_py:
        cpp_lines = f_cpp.readlines()
        py_lines = f_py.readlines()

    # zip() would silently drop the surplus lines; a length mismatch means the
    # pairing itself cannot be trusted, so fail loudly instead.
    if len(cpp_lines) != len(py_lines):
        raise ValueError(
            f"Parallel files are misaligned: {cpp_path} has {len(cpp_lines)} lines "
            f"but {py_path} has {len(py_lines)}"
        )

    dataset = []
    for cpp_code, py_code in zip(cpp_lines, py_lines):
        cpp_code = cpp_code.replace('NEW_LINE', '\n').replace('STRNEWLINE', '\n')
        py_code = py_code.replace('NEW_LINE', '\n').replace('STRNEWLINE', '\n')

        # Clean Python code before putting into prompt
        py_code = clean_python_code(py_code)

        prompt = f"Translate the following C++ code into Python:\n\n{cpp_code}\n\nAnswer: {py_code}"
        dataset.append({"text": prompt})

    return Dataset.from_list(dataset)

In [None]:
# Turn each pair of parallel files into a prompt dataset (training and validation splits)
train = load_code_pair(cpp_path=train_cpp_path, py_path=train_py_path)
val = load_code_pair(cpp_path=val_cpp_path, py_path=val_py_path)

In [None]:
# Eyeball one prompt from each split (record 1000) to sanity-check the formatting
print(train[1000], val[1000], sep="\n")

{'text': 'Translate the following C++ code into Python:\n\nint main ( ) { long long int A = 7 , B = 15 ; cout << minOperations ( A , B ) << endl ; return 0 ; }\n\n\nAnswer: A = 7\nB = 15\nprint ( minOperations ( A , B ) )\n'}
{'text': 'Translate the following C++ code into Python:\n\nint pos = i ; for ( int j = i + 1 ; j < n ; ++ j ) {\n\n\nAnswer: pos = i\nfor j in range ( i + 1 , n ) :\n'}


In [None]:
# Open the Weights & Biases run for this training session, if logging is enabled
if LOG_TO_WANDB:
    wandb.init(name=RUN_NAME, project=PROJECT_NAME)

In [None]:
# pick the right quantization

if QUANT_4_BIT:
  # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
  )
else:
  # 8-bit path. BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (the
  # compute-dtype setting exists only for 4-bit), so the previously passed
  # keyword was ignored as an unused kwarg and has been removed.
  quant_config = BitsAndBytesConfig(
    load_in_8bit=True
  )

In [None]:
# Load Tokenizer and Model
# No trust_remote_code here: the model load below does not enable it either, and
# the flag would allow code shipped in the Hub repo to run in this kernel.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse EOS as the padding token and pad on the right for training batches
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model, placed on the available device(s) automatically
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.gradient_checkpointing_enable()
base_model.generation_config.pad_token_id = tokenizer.pad_token_id
# PEFT helper that readies a k-bit quantized model for adapter training
# (imported in the notebook's import cell).
base_model = prepare_model_for_kbit_training(base_model)

print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Memory footprint: 7693.4 MB


In [None]:
def filter_long(example):
    """Return True when the example's prompt fits the sequence budget.

    The prompt in ``example["text"]`` is tokenised without special tokens and
    kept only if it is shorter than MAX_SEQUENCE_LENGTH minus KEEP_BUFFER.
    """
    token_ids = tokenizer.encode(example["text"], add_special_tokens=False)
    budget = MAX_SEQUENCE_LENGTH - KEEP_BUFFER
    return len(token_ids) < budget

# Drop over-length prompts from both splits (tokenised in 4 worker processes)
train = train.filter(filter_long, num_proc=4)
val = val.filter(filter_long, num_proc=4)

# Keep a reproducibly shuffled quarter of each split
train_keep = len(train) // 4
val_keep = len(val) // 4
train = train.shuffle(seed=42).select(range(train_keep))
val = val.shuffle(seed=42).select(range(val_keep))

print(f"Training set size after filter: {len(train):,}")

def max_len(ds):
    """Return the largest token count (special tokens excluded) among the "text" fields of ``ds``."""
    token_counts = [
        len(tokenizer.encode(row["text"], add_special_tokens=False))
        for row in ds
    ]
    return max(token_counts)

# Report the longest remaining prompt in each split
longest_train = max_len(train)
print("Longest train sample:", longest_train)
longest_val = max_len(val)
print("Longest eval  sample:", longest_val)

Filter (num_proc=4):   0%|          | 0/80100 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/7228 [00:00<?, ? examples/s]

Training set size after filter: 20,007
Longest train sample: 497
Longest eval  sample: 380


In [None]:
# Data Collator
# Completion-only collator keyed on the "Answer:" marker that load_code_pair puts
# in front of the Python translation in every prompt.
# (DataCollatorForCompletionOnlyLM is imported in the notebook's import cell
# rather than mid-notebook.)
response_template = "Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [None]:
# LoRA adapter settings, taken from the hyperparameter constants
lora_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none",
)

# # After loading model and before fine_tuning:
# from peft import get_peft_model
# # Inject LoRA
# base_model = get_peft_model(base_model, lora_parameters)
# # Optional: print trainable parameters
# base_model.print_trainable_parameters()

# Trainer config
train_parameters = SFTConfig(
    # Where checkpoints are written, and how long / how big each step is
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    # Evaluate and log on the same STEPS cadence
    eval_strategy="steps",
    eval_steps=STEPS,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=5,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    # bfloat16 training, matching the quantization compute dtype
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    # -1 leaves the run length to num_train_epochs
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    # The string "none" switches reporting off. Passing None does not: it falls
    # back to the library default, which can still enable installed
    # integrations such as wandb.
    report_to="wandb" if LOG_TO_WANDB else "none",
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    # Every checkpoint save is also pushed to a private Hub repo
    save_strategy="steps",
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

# Supervised fine-tuning trainer: quantized base model + LoRA adapter config,
# trained on the filtered prompts with the completion-only collator.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=train,
    eval_dataset=val,
    peft_config=lora_parameters,
    # Hand over the tokenizer configured above (pad token, right padding) so the
    # trainer tokenises with the same object the collator uses, instead of
    # instantiating its own from the model name.
    processing_class=tokenizer,
    args=train_parameters,
    data_collator=collator
)

Converting train dataset to ChatML:   0%|          | 0/20007 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/20007 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20007 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20007 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1806 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/1806 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1806 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1806 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Fine-tune!
torch.cuda.empty_cache()
fine_tuning.train()

# Push our fine-tuned model to Hugging Face.
# Use HUB_MODEL_NAME, the namespaced repo id the trainer was configured with
# (hub_model_id), so the final push cannot land in a different repository than
# the checkpoints pushed during training.
fine_tuning.model.push_to_hub(HUB_MODEL_NAME, private=True)
print(f"Saved to the hub: {HUB_MODEL_NAME}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,0.3359,0.250438
100,0.2307,0.212331
150,0.1924,0.211455
200,0.2109,0.202965
250,0.1931,0.19993
300,0.2047,0.20128
350,0.2029,0.197848
400,0.172,0.195766
450,0.1927,0.189506
500,0.1978,0.192078


[34m[1mwandb[0m: Adding directory to artifact (./code-translator-2025-04-28_18.24.41/checkpoint-1251)... Done. 0.7s
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


README.md:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved to the hub: code-translator-2025-04-28_18.24.41


In [None]:
# Close the Weights & Biases run opened earlier, if logging was enabled
if LOG_TO_WANDB:
    wandb.finish()

0,1
eval/loss,█▅▄▄▃▄▃▃▂▃▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁
eval/mean_token_accuracy,▁▅▇▆▇▅▇▆▇▆█▇█▇████▇██████
eval/num_tokens,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
eval/runtime,▂▃▅▁▁▅▄▂▃▂▃▃▄▄▄▄▅▃█▅▅▃▄▅▅
eval/samples_per_second,▇▆▄██▄▅▇▆▇▆▆▅▅▅▅▄▆▁▄▄▆▅▄▄
eval/steps_per_second,▇▆▄██▄▅▇▆▇▆▆▅▅▅▅▄▆▁▄▄▆▅▄▄
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▃▄▂▅▇▅▃▂▂▃▃▄▂▂█▃▂▂▃▂▁▁▂▁▃
train/learning_rate,████▇▇▇▇▆▆▅▅▄▄▄▃▃▂▂▂▁▁▁▁▁

0,1
eval/loss,0.17294
eval/mean_token_accuracy,0.95605
eval/num_tokens,1250897.0
eval/runtime,250.3927
eval/samples_per_second,7.213
eval/steps_per_second,7.213
total_flos,6.063859975023821e+16
train/epoch,1.0
train/global_step,1251.0
train/grad_norm,0.87957
