## Traing a causal language model for Python data science code generation

In [2]:
def any_keyword_in_string(string, keywords):
    return any(keyword in string for keyword in keywords)

In [3]:
filters = ["pandas", "polars"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"
print(any_keyword_in_string(example_1, filters))  # Should print False
print(any_keyword_in_string(example_2, filters))  # Should print True

False
True


In [4]:
# Even the sample data is rather large, so we will filter it down using streaming
from datasets import load_dataset, Dataset, DatasetDict

ds_train_stream = load_dataset(
    "huggingface-course/codeparrot-ds-train",
    split="train",
    streaming=True,
).take(20_000)
ds_val_stream = load_dataset(
    "huggingface-course/codeparrot-ds-valid",
    split="validation",
    streaming=True,
).take(5_000)

raw_datasets = DatasetDict({
    "train": Dataset.from_list(list(ds_train_stream)),
    "validation": Dataset.from_list(list(ds_val_stream)),
})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [5]:
# Show a few examples
raw_datasets["train"][0]

{'repo_name': 'kmike/scikit-learn',
 'path': 'sklearn/utils/__init__.py',
 'copies': '3',
 'size': '10094',
 'license': 'bsd-3-clause'}

In [8]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}...\n")

REPO_NAME: kmike/scikit-learn...

PATH: sklearn/utils/__init__.py...

COPIES: 3...

SIZE: 10094...

CONTENT: """
The :mod:`sklearn.utils` module includes various utilites.
"""

from collections import Sequence

import numpy as np
from scipy.sparse import issparse

from .murmurhash import murm...

LICENSE: bsd-3-clause...



In [9]:
# Cut contents into chunks
from transformers import AutoTokenizer
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    raw_datasets["train"]["content"][:2],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs lengths: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {(outputs['overflow_to_sample_mapping'])}")

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Input IDs lengths: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [11]:
for key in outputs:
    print(f"{key.upper()}: {outputs[key]}")

INPUT_IDS: [[280, 173, 2096, 310, 2025, 749, 24661, 14, 1377, 64, 1340, 6376, 8258, 4705, 2646, 14, 173, 280, 173, 173, 973, 4962, 978, 7187, 173, 173, 2745, 1601, 442, 635, 173, 973, 4985, 14, 6322, 978, 29638, 173, 2745, 3758, 173, 173, 973, 1428, 77, 26853, 250, 1817, 978, 288, 26853, 250, 1817, 19, 63, 1551, 173, 973, 1428, 6436, 978, 308, 352, 63, 1345, 63, 783, 12, 951, 63, 7194, 12, 4724, 63, 5293, 12, 373, 1387, 63, 536, 63, 7544, 12, 960, 18, 68, 12, 35001, 18, 68, 63, 228, 63, 22229, 12, 373, 35001, 18, 68, 63, 228, 63, 9422, 12, 6792, 63, 804, 63, 854, 63, 1345, 12, 373, 951, 63, 2437, 63, 977, 9, 173, 973, 1428, 692, 63, 2077, 978, 3115, 63], [692, 63, 2077, 173, 173, 612, 536, 612, 233, 2558, 77, 26853, 250, 1817, 19, 63, 1551, 485, 333, 352, 63, 1345, 63, 783, 485, 333, 1207, 63, 7194, 485, 333, 4048, 63, 5293, 485, 1366, 333, 5224, 63, 536, 63, 7544, 485, 333, 783, 18, 68, 485, 333, 11198, 18, 68, 63, 228, 63, 22229, 485, 1366, 333, 11198, 18, 68, 63, 228, 63, 9422, 485,

In [12]:
# Define a helper function to "map"
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

tokenized_datasets

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3322 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 557217
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 93164
    })
})

In [15]:
# Model config
from transformers import AutoConfig, AutoTokenizer, GPT2LMHeadModel

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=tokenizer.vocab_size,
    c_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Initialize the model
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1e6:.2f}M parameters")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Model size: 124.24M parameters


In [16]:
# Data collator
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a pad token by default
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [17]:
# Test it out
out = data_collator([tokenized_datasets["train"][i] for i in range(4)])
for key in out:
    print(f"{key.upper()}: {out[key].shape}")

INPUT_IDS: torch.Size([4, 128])
ATTENTION_MASK: torch.Size([4, 128])
LABELS: torch.Size([4, 128])


In [18]:
# Set up the trainer
from transformers import Trainer, TrainingArguments
args = TrainingArguments(
    output_dir="../data/models/codeparrot-ds",
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    eval_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
trainer.train()

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose "Don't visualize my results"


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


In [None]:
# Set up hf credentials
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo

load_dotenv()
token = os.getenv("HF_TOKEN_WRITE")

trainer.push_to_hub()

In [None]:
# Test the model
import torch
from transformers import pipeline
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model="../data/models/codeparrot-ds",
    tokenizer=tokenizer,
    device=device,
)

txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""

print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
x = np.random.randn(100)
y = np.random.randn(100)
plt.scatter(x, y)