# Pretraining on .py files

In [2]:
import random
from git.repo import Repo
from pathlib import Path
from rich.markdown import Markdown

Let's clone vllm: 

```bash
git clone https://github.com/vllm-project/vllm.git
```


let's bring some text in

In [3]:
vllm_path = Path("vllm/")
vllm_repo = Repo(vllm_path)

In [4]:
# Get the latest commit
last_commit = vllm_repo.commit()

# Print the commit message
print(last_commit)

90979c38f87c17d53a7cd0eb430373ecb0b64b9a


Naively we can bring all MarkDown files

In [5]:
def find_files(directory, extension="*.py"):
    "Find all files of a given `extension` in a directory and return their content and path"
    md_files = []
    for file in Path(directory).rglob(extension):
        with open(file, 'r', encoding='utf-8') as md_file:
            content = md_file.read()
        md_files.append((file.relative_to(directory), content))
    return md_files

In [6]:
py_files = find_files(vllm_path)

file = random.choice(py_files)

len(py_files), print(file[1][0:500])

"""Compare the outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/models/test_models.py --forked`.
"""
import pytest

MODELS = [
    "facebook/opt-125m",
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/gpt-j-6b",
    "EleutherAI/pythia-70m",
    "bigscience/bloom-560m",
    "mosaicml/mpt-7b",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-2-7b-hf",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("ma


(96, None)

We should stack the file with some metadata, for instance, the path where the files is coming from.

In [7]:
def stuff(file):
    path, content = file
    return f"<<Begin file>>\nPath:\n{path}\n---------\nContent:\n{content}<<End File>>"

In [8]:
print(stuff(file))

<<Begin file>>
Path:
tests/models/test_models.py
---------
Content:
"""Compare the outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/models/test_models.py --forked`.
"""
import pytest

MODELS = [
    "facebook/opt-125m",
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/gpt-j-6b",
    "EleutherAI/pythia-70m",
    "bigscience/bloom-560m",
    "mosaicml/mpt-7b",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-2-7b-hf",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

  

In [9]:
import json
from tqdm.auto import tqdm

with open('vllm_python.jsonl', 'w') as json_file:
    for file in tqdm(py_files):
        data = stuff(file)
        json.dump({"text":data}, json_file)
        json_file.write('\n')

  0%|          | 0/96 [00:00<?, ?it/s]

## How much data do we have?

In [10]:
from transformers import AutoTokenizer

In [11]:
OS_MODEL = "codellama/CodeLlama-7b-Python-hf"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(OS_MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
tokenizer.encode("def hello_world():\n\tprint('Hello World!')")

[1,
 822,
 22172,
 29918,
 11526,
 7295,
 13,
 12,
 2158,
 877,
 10994,
 2787,
 29991,
 1495]

Let's save each file on a separate line of a JSONL file

In [14]:
import json
def read_jsonl(fname):
    "Read a .jsonl file and return a list of dicts"
    with open(fname, 'r') as json_file:
        return [json.loads(line) for line in json_file]

In [15]:
data = read_jsonl("vllm_python.jsonl")

In [16]:
raw_corpus = "\n".join([d['text'] for d in data])

In [17]:
raw_corpus



In [18]:
tokenized_data = tokenizer.encode(raw_corpus)

In [19]:
tokens = len(tokenized_data)
print(f"VLLM .py Total tokens: {tokens/1_000_000}M")

VLLM .py Total tokens: 0.183235M


That's not a lot of tokens :P

## Save to W&B

In [20]:
import wandb

with wandb.init(project="vllm_llm"):
    at = wandb.Artifact(name="vllm_python", 
                        description="The .py files from the vllm library",
                        type="dataset",
                        metadata={
                            "url": "https://github.com/vllm-project/vllm.git",
                            "commit":last_commit,
                            "remote": vllm_repo.remote().url,
                            "tokens": tokens})
    at.add_file("vllm_python.jsonl")
    
    wandb.log_artifact(at)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




VBox(children=(Label(value='0.645 MB of 0.660 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.978074…