In [1]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy==1.26.4

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

print("✅ pandas:", pd.__version__)
print("✅ numpy:", np.__version__)


✅ pandas: 2.2.3
✅ numpy: 1.26.4


In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
import torch

  from .autonotebook import tqdm as notebook_tqdm
2025-05-15 08:23:28.532313: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-15 08:23:28.822264: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747297408.925546    3532 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747297408.959749    3532 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747297409.207161    3532 computation_placer.cc:177] computation placer already r

In [5]:
# Load and split dataset
data_path = "data/final_training_data.jsonl"
raw_dataset = load_dataset("json", data_files=data_path, split="train")
split_dataset = raw_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [6]:
# Load model and tokenizer
model_name = "bigcode/starcoder2-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]


In [7]:
# Use EOS token as pad token (required for training)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [8]:
# LoRA configuration (minimal footprint)
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj"],
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

In [9]:
# Tokenize function
def tokenize(example):
    return tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_train_dataset = train_dataset.map(tokenize, remove_columns=["prompt"])
tokenized_eval_dataset = eval_dataset.map(tokenize, remove_columns=["prompt"])

In [10]:
# Data collator
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
import transformers
print(transformers.__version__)

4.51.3


In [12]:
import transformers
print(transformers.__version__)
from transformers import TrainingArguments
print(TrainingArguments.__module__)
print(dir(TrainingArguments))


4.51.3
transformers.training_args
['_VALID_DICT_FIELDS', '__annotations__', '__class__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_dict_torch_dtype_to_str', '_n_gpu', '_no_sync_in_gradient_accumulation', '_setup_devices', 'accelerator_config', 'adafactor', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'auto_find_batch_size', 'average_tokens_across_devices', 'batch_eval_metrics', 'bf16', 'bf16_full_eval', 'data_seed', 'dataloader_drop_last', 'dataloader_num_workers', 'dataloader_persistent_workers', 'dataloader_pin_memory', 'dataloader_prefetch_factor', 'ddp_backend', 'ddp_broadcast_buffers', 'ddp_bucket_cap_mb', 'ddp_fin

In [13]:
from transformers import TrainingArguments

print(hasattr(TrainingArguments, 'evaluation_strategy'))
print(hasattr(TrainingArguments, 'eval_strategy'))


False
True


In [14]:
from transformers import TrainingArguments
print(TrainingArguments.__module__)


transformers.training_args


In [15]:
import sys
print(sys.executable)

/root/ds677/myenv/bin/python


In [19]:
# TrainingArguments
from transformers import TrainingArguments
output_dir="output/starcoder3b"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    logging_dir="logs",
    logging_steps=50,           # log every 50 steps
    eval_strategy="epoch",      # evaluate every epoch
    logging_strategy="steps",
    report_to="tensorboard",    # enable TensorBoard
    # max_steps=200,
    learning_rate=3e-4,
    bf16=True,
    fp16=False,
    save_strategy="no"
)

In [20]:
# Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# tensorboard --logdir logs

In [None]:
# Save final model
trainer.save_model(f"{output_dir}/final")
tokenizer.save_pretrained(f"{output_dir}/final")
print("✅ LoRA fine-tuning of StarCoder 3B complete")

✅ LoRA fine-tuning of StarCoder 3B complete


In [2]:
!zip -r starcoder3b_lora_trained.zip output/starcoder3b/final

  adding: output/starcoder3b/final/ (stored 0%)
  adding: output/starcoder3b/final/adapter_config.json (deflated 54%)
  adding: output/starcoder3b/final/adapter_model.safetensors

 (deflated 7%)
  adding: output/starcoder3b/final/vocab.json (deflated 57%)
  adding: output/starcoder3b/final/tokenizer_config.json (deflated 90%)
  adding: output/starcoder3b/final/README.md (deflated 66%)
  adding: output/starcoder3b/final/training_args.bin (deflated 53%)
  adding: output/starcoder3b/final/special_tokens_map.json (deflated 72%)
  adding: output/starcoder3b/final/merges.txt (deflated 51%)
  adding: output/starcoder3b/final/tokenizer.json (deflated 81%)


In [4]:
# Inference: Run prompts on your fine-tuned StarCoder 3B (LoRA)

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

model_path = "output/starcoder3b/final"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).to("cuda")

# Create pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

# Test prompt
prompt = """
**Instruction:**
Implement the `get` method to retrieve a `RepositoryDescriptionInfo` based on an integer input. This method accepts an integer parameter `value` and returns a `RepositoryDescriptionInfo` object. Ensure that the method handles only specific values: `CLASS_VALUE` returns `CLASS`, and `IS_VALUE` returns `IS",
**Completion:**
```java
"""
output = generator(
    prompt,
    max_new_tokens=200,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    return_full_text=False
)[0]["generated_text"]

print("\n=== Generated Code ===\n")
print(output)


Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 42.28it/s]
Device set to use cuda:0



=== Generated Code ===

public RepositoryDescriptionInfo get(int value) {
    switch (value) {
        case CLASS_VALUE:
            return CLASS;
        case IS_VALUE:
            return IS;
        default:
            return null;
    }
}
```
**Key Points:**
- The method should return a `RepositoryDescriptionInfo` object based on the provided `value`.
- Ensure that the method handles only specific values (`CLASS_VALUE` and `IS_VALUE`) and returns `null` for any other input.

**Notes:**
- The method should be part of a class that manages repository descriptions.

**References:**
- [RepositoryDescriptionInfo](https://github.com/apache/maven-shared-utils/blob/master/src/main/java/org/apache/maven/shared/utils/RepositoryDescriptionInfo.java)

**Related Topics:**
- [Switch Statements](https://docs.oracle.com/javase


In [5]:
def truncate_after_first_code_block(text):
    end = text.find("```", text.find("```") + 3)  # second triple backtick
    return text[:end+3] if end != -1 else text

raw = generator(prompt, max_new_tokens=1000, do_sample=False)[0]["generated_text"]
print(truncate_after_first_code_block(raw))



**Instruction:**
Implement the `get` method to retrieve a `RepositoryDescriptionInfo` based on an integer input. This method accepts an integer parameter `value` and returns a `RepositoryDescriptionInfo` object. Ensure that the method handles only specific values: `CLASS_VALUE` returns `CLASS`, and `IS_VALUE` returns `IS",
**Completion:**
```java
public RepositoryDescriptionInfo get(int value) {
    switch (value) {
        case CLASS_VALUE:
            return CLASS;
        case IS_VALUE:
            return IS;
        default:
            return null;
    }
}
```
