In [26]:
pip install torch transformers transformers[torch] datasets evaluate

Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
from datasets import Dataset
import pandas as pd

### Use Hugging Face to download a pre-trained code model

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifier of the pre-trained checkpoint; the tokenizer and the causal-LM
# weights are both loaded from this same name.
model_name = "microsoft/CodeGPT-small-py"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Import the dataset used to train and evaluate the fine-tuned model

In [3]:
import json

# Path to the JSONL file (one JSON object per line)
file_path = "./HumanEval.jsonl"

# Parse one record per line. Whitespace-only lines (for example a trailing
# blank line at the end of the file) are skipped, because json.loads raises
# JSONDecodeError when handed an empty string.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Display the first example
print(data[0])


{'task_id': 'HumanEval/0', 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n', 'entry_point': 'has_close_elements', 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", 'la

In [4]:
from datasets import Dataset
import pandas as pd

# Load the parsed records into a DataFrame, then wrap that frame as a
# Hugging Face Dataset.
records_frame = pd.DataFrame(data)
dataset = Dataset.from_pandas(records_frame)

### Preprocess the Dataset to tokenize and format it for the model

Tokenize the Dataset

In [5]:
def tokenize_function(examples):
    """Tokenize the "prompt" field of a batch of examples.

    Sequences longer than 512 tokens are truncated and shorter ones are
    padded up to 512, so every encoded row has the same length.
    """
    prompts = examples["prompt"]
    return tokenizer(
        prompts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# batched=True passes a batch of rows to the function on each call
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Split into Train and Evaluation sets

In [6]:
# Hold out 20% of the examples for evaluation and train on the remaining 80%.
# A fixed seed makes the shuffle-and-split reproducible, so the same examples
# land in each split every time the notebook is re-run.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

### Fine-tune the model
Set up a training loop using PyTorch.

In [7]:
import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader

class CodeDataset(TorchDataset):
    """Torch dataset view over tokenized rows, for causal-LM fine-tuning.

    Each row of the wrapped dataset must provide "input_ids" and
    "attention_mask" sequences of equal length. Items are returned as
    dictionaries of ``torch.long`` tensors with the keys "input_ids",
    "attention_mask" and "labels".
    """

    # Label value that is skipped when the loss is computed (the default
    # ``ignore_index`` of ``torch.nn.CrossEntropyLoss``).
    IGNORE_INDEX = -100

    def __init__(self, dataset):
        """Store the indexable collection of tokenized rows to wrap."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``.

        The labels are a copy of ``input_ids`` in which every position with
        ``attention_mask == 0`` (i.e. padding) is replaced by
        ``IGNORE_INDEX``, so padded positions do not contribute to the loss.
        """
        item = self.dataset[idx]  # Get the row correctly
        input_ids = torch.tensor(item["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(item["attention_mask"], dtype=torch.long)

        # Clone so that masking the labels never modifies input_ids.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Wrap both Hugging Face splits so that indexing them yields torch tensors
train_dataset, eval_dataset = CodeDataset(train_dataset), CodeDataset(eval_dataset)

# Batches of two rows each; only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=2)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=2)


Define the Training Loop

In [None]:
from torch.optim import AdamW

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)

# No separate loss function is created: when `labels` are passed to the model,
# it computes the language-modeling loss itself and returns it as `outputs.loss`.

# Set training parameters
epochs = 3
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Labels are passed unshifted; the causal-LM head shifts them
        # internally so that each position is scored on the next token.
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

print("Fine-tuning complete!")


Epoch 1/3


After training, save the model

In [None]:
# Write the model weights and the tokenizer files into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_CodeGPT")

### Benchmark and Evaluate the Model
After fine-tuning, compute the average loss on the held-out evaluation split.

In [None]:
# Switch to evaluation mode (disables dropout) before scoring
model.eval()
total_eval_loss = 0

# Gradients are not needed for evaluation
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Use the labels prepared by the dataset instead of re-deriving them
        # from input_ids, so evaluation is scored against the same targets
        # as training.
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_eval_loss += loss.item()

# Mean of the per-batch losses over the evaluation split
avg_eval_loss = total_eval_loss / len(eval_dataloader)
print(f"Evaluation Loss: {avg_eval_loss:.4f}")

### Test the fine-tuned model

In [None]:
def generate_code(prompt, max_length=200):
    """Generate a completion for ``prompt`` with the fine-tuned model.

    Args:
        prompt: Source-code text for the model to continue.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (upper bound on the length of the returned sequence, prompt
            included). Defaults to 200.

    Returns:
        The decoded text of the first returned sequence, with special
        tokens removed.
    """
    # Make sure dropout is disabled even if this cell is run right after
    # training, without the evaluation cell having been executed.
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generate_code("def fibonacci(n):"))
