# Pretraining on .py files

In [4]:
import random
from git.repo import Repo
from pathlib import Path
from rich.markdown import Markdown

Let's clone vllm: 

```bash
git clone https://github.com/vllm-project/vllm.git
```


let's bring some text in

In [7]:
vllm_path = Path("vllm/")
vllm_repo = Repo(vllm_path)

In [8]:
# Get the latest commit
last_commit = vllm_repo.commit()

# Print the commit message
print(last_commit)

bc0644574ca12d754a031596bdcfe8e1f0e6ab39


Naively we can bring all MarkDown files

In [9]:
def find_files(directory, extension="*.py"):
    "Find all files of a given `extension` in a directory and return their content and path"
    md_files = []
    for file in Path(directory).rglob(extension):
        with open(file, 'r', encoding='utf-8') as md_file:
            content = md_file.read()
        md_files.append((file.relative_to(directory), content))
    return md_files

In [10]:
py_files = find_files(vllm_path)

file = random.choice(py_files)

len(py_files), print(file[1][0:500])

import copy
import time
from functools import partial
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm


(96, None)

We should stack the file with some metadata, for instance, the path where the files is coming from.

In [13]:
stuff = """<<Begin file>>
Path: {path}
---------
Content:
{content}
<<End File>>"""

In [14]:
print(stuff.format(path=file[0], content=file[1][0:500]))

<<Begin file>>
Path: vllm/engine/llm_engine.py
---------
Content:
import copy
import time
from functools import partial
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm
<<End File>>


In [15]:
import json
from tqdm.auto import tqdm

with open('vllm_python.jsonl', 'w') as json_file:
    for path, content in tqdm(py_files):
        data = stuff.format(path=path, content=content)
        json.dump({"text":data}, json_file)
        json_file.write('\n')

  0%|          | 0/96 [00:00<?, ?it/s]

## How much data do we have?

In [16]:
from transformers import AutoTokenizer

In [17]:
OS_MODEL = "codellama/CodeLlama-7b-Python-hf"

In [18]:
tokenizer = AutoTokenizer.from_pretrained(OS_MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
tokenized_sentence = tokenizer.encode("def hello_world():\n\tprint('Hello World!')")

In [24]:
len(tokenized_sentence)

14

In [25]:
print(tokenized_sentence)

[1, 822, 22172, 29918, 11526, 7295, 13, 12, 2158, 877, 10994, 2787, 29991, 1495]


Let's save each file on a separate line of a JSONL file

In [27]:
import json
def read_jsonl(fname):
    "Read a .jsonl file and return a list of dicts"
    with open(fname, 'r') as json_file:
        return [json.loads(line) for line in json_file]

In [28]:
data = read_jsonl("vllm_python.jsonl")

In [29]:
raw_corpus = "\n".join([d['text'] for d in data])

In [30]:
len(raw_corpus)

604130

In [31]:
tokenized_data = tokenizer.encode(raw_corpus)

In [33]:
tokens = len(tokenized_data)
print(f"VLLM .py files total tokens: {tokens/1_000_000}M")

VLLM .py files total tokens: 0.184048M


That's not a lot of tokens :P

## Save to W&B

In [20]:
import wandb

with wandb.init(project="vllm_llm"):
    at = wandb.Artifact(name="vllm_python", 
                        description="The .py files from the vllm library",
                        type="dataset",
                        metadata={
                            "url": "https://github.com/vllm-project/vllm.git",
                            "commit":last_commit,
                            "remote": vllm_repo.remote().url,
                            "tokens": tokens})
    at.add_file("vllm_python.jsonl")
    
    wandb.log_artifact(at)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




VBox(children=(Label(value='0.645 MB of 0.660 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.978074…