# Dataset preparation guide for the code infilling task

Prepare a dataset for training the DeepSeek-Coder model.

- Deepseek-coder Model card: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
- LLama factory dataset format: https://llamafactory.readthedocs.io/zh-cn/latest/getting_started/data_preparation.html
- Model Hub: https://github.com/aws-samples/llm_model_hub
- Reference blog: https://aws.amazon.com/cn/blogs/china/yxt-innovative-practice-of-fine-tuning-large-models-and-enabling-code-generation-based-on-amazon-sagemaker/

In [1]:
!pip install smart_open datasets tree_sitter tree_sitter_python tree_sitter_java tree_sitter_javascript tree_sitter_cpp transformers
# You need to log in to Hugging Face first: run "huggingface-cli login"



## Download sample dataset

As an example, this section downloads a sample of the bigcode/the-stack-v2-train-smol-ids dataset. Alternatively, you can put your own code files in the `code` directory.

In [2]:
import os
import boto3
from smart_open import open
from datasets import load_dataset
from tqdm import tqdm

# Create a boto3 session and an S3 client. download_contents passes this
# client to smart_open when reading blobs from the softwareheritage bucket.
# NOTE(review): presumably relies on AWS credentials already configured in the
# environment — confirm.
session = boto3.Session()
s3 = session.client("s3")

def download_contents(files):
    """Attach the decoded source text to every file entry.

    For each entry the blob at ``s3://softwareheritage/content/<blob_id>`` is
    opened in binary mode with ``compression=".gz"``, read completely and
    decoded with the entry's ``src_encoding``. The text is stored under the
    ``"content"`` key of the entry (the entries are modified in place).

    Args:
        files: Iterable of dict-like entries providing ``blob_id`` and
            ``src_encoding``.

    Returns:
        ``{"files": files}`` - the same object that was passed in, wrapped in
        a dict.
    """
    for entry in files:
        blob_url = "s3://softwareheritage/content/{}".format(entry["blob_id"])
        with open(
            blob_url,
            "rb",
            compression=".gz",
            transport_params={"client": s3},
        ) as blob:
            raw_bytes = blob.read()
        entry["content"] = raw_bytes.decode(entry["src_encoding"])

    return {"files": files}

# streaming=True iterates the dataset lazily instead of downloading it up front;
# the map step attaches the file contents to each row as it is consumed.
ds = load_dataset("bigcode/the-stack-v2-train-smol-ids", split="train", streaming=True)
ds = ds.map(lambda row: download_contents(row["files"]))
# File extensions that are kept; everything else is skipped.
filetype = {
    "java",
    "js",
    "py",
}

import os
os.makedirs("code", exist_ok=True)


# Number of source files to save into the "code" directory.
total_count = 1000
count = 0
progress = tqdm(total=total_count)
for row in ds:
    for file in row["files"]:
        filename = file["path"].split("/")[-1]
        file_suffix = filename.split(".")[-1]
        if file_suffix not in filetype:
            continue
        # Use a context manager so every written file is flushed and closed.
        with open(f"code/{file['content_id']}.{file_suffix}", "w") as fout:
            fout.write(file["content"])
        count += 1
        progress.update(1)
        # Check right after counting so exactly total_count files are written
        # and the progress bar never runs past its total.
        if count >= total_count:
            break
    if count >= total_count:
        break
progress.close()

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

 78%|███████▊  | 775/1000 [04:53<20:00,  5.34s/it]

## Preprocess code
1. Preprocess the code by parsing it into an AST
2. Randomly split the code into prefix + middle + suffix
3. Apply the DeepSeek-Coder prompt template

In [3]:
import glob
import json
import os
from tqdm import tqdm
from code_splitter import split_code_randomly
from transformers import AutoTokenizer
# Tokenizer of the deepseek-coder base model; code_to_dataset uses it to count
# the tokens of every generated sample.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-base")


def read_files(directory):
    """Yield ``(content, file_name)`` for the source files directly inside ``directory``.

    Only entries matching ``*.java``, ``*.py`` or ``*.js`` are considered;
    sub-directories are not searched. A file that cannot be read is reported
    on stdout and skipped.

    Args:
        directory: Directory to scan for source files.

    Yields:
        Tuples of the file's text and its base name.
    """
    # Process .java, .py and .js files
    patterns = ['*.java', '*.py', '*.js']
    files = []
    for pattern in patterns:
        files.extend(glob.glob(os.path.join(directory, pattern)))

    for file_path in files:
        # A directory can match a pattern too (e.g. a folder named "pkg.py")
        if os.path.isdir(file_path):
            continue
        try:
            with open(file_path, 'r') as file:
                content = file.read()
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}")
            continue
        # Yield only after the file is closed and outside the try block, so no
        # handle stays open while the generator is suspended and only read
        # errors are reported as read errors.
        yield content, os.path.basename(file_path)

def code_to_text(prefix, middle, suffix):
    """Build one fill-in-the-middle training sample.

    The sample is laid out as: begin marker, prefix, hole marker, suffix,
    end marker, middle, end-of-sentence marker.

    Args:
        prefix: Code before the hole.
        middle: Code that fills the hole (placed after the end marker).
        suffix: Code after the hole.

    Returns:
        A dict with the single key ``"text"`` holding the assembled sample.
    """
    template = "<｜fim▁begin｜>{}<｜fim▁hole｜>{}<｜fim▁end｜>{}<｜end▁of▁sentence｜>"
    sample = template.format(prefix, suffix, middle)
    return {"text": sample}

def test_split_code_randomly():
    """Smoke-test ``split_code_randomly`` on every file yielded by ``read_files``.

    Each file is split once with default parameters and once with randomly
    chosen middle-length bounds. In both cases prefix + middle + suffix must
    reassemble to the original source.

    Raises:
        AssertionError: If a split does not reassemble to the original code.
    """
    # Imported locally because the surrounding cell does not import ``random``.
    import random

    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        # ``__file__`` is not defined when this runs inside a notebook kernel.
        base_dir = os.getcwd()
    directory = os.path.abspath(os.path.join(base_dir, '../the-stack-v2-train-smol-ids', 'code'))

    for code, file_name in read_files(directory):
        print(f"Testing file: {file_name}")
        # Use the file's own extension as the language, as code_to_dataset does,
        # instead of treating every file as Java.
        language = file_name.split(".")[-1]

        # Test with default parameters
        result = split_code_randomly(code, language=language)
        assert result['prefix'] + result['middle'] + result['suffix'] == code

        # Test with custom parameters
        min_length = random.randint(1, 10)
        max_length = random.randint(16, 64)
        result = split_code_randomly(code, language=language, min_middle_length=min_length, max_middle_length=max_length)
        assert len(result['prefix']) + len(result['middle']) + len(result['suffix']) == len(code)
        assert result['prefix'] + result['middle'] + result['suffix'] == code

        print(f"  Prefix length: {len(result['prefix'])}")
        print(f"  Middle length: {len(result['middle'])}")
        print(f"  Suffix length: {len(result['suffix'])}")
        print("  Test passed!")
        print()

def code_to_dataset(code_path, output_path, min_middle_length=1, max_middle_length=64, splits_per_file=10, min_tokens=4, max_tokens=32768):
    """
    Convert code files to a dataset of fill-in-the-middle samples.

    Every file returned by ``read_files(code_path)`` is split up to
    ``splits_per_file`` times with ``split_code_randomly``. Each split is
    rendered with ``code_to_text`` and collected. The collected samples are
    written to ``output_path`` as a single JSON array of ``{"text": ...}``
    objects.

    Args:
        code_path: Directory containing source code files
        output_path: Path of the JSON file to write
        min_middle_length: Minimum length of the middle section
        max_middle_length: Maximum length of the middle section
        splits_per_file: Number of different splits to generate for each file
        min_tokens: Minimum token count of a sample; when a sample is shorter,
            the remaining splits of that file are skipped
        max_tokens: Maximum token count of a sample; when a sample is longer,
            the remaining splits of that file are skipped
    """

    # Create output directory if it doesn't exist. dirname is "" for a bare
    # file name, and os.makedirs("") raises, so only create a real directory.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    datasets = []
    tokens = 0
    # Process each code file
    for code, file_name in tqdm(read_files(code_path)):
        try:
            # Get file extension (language)
            language = file_name.split(".")[-1]

            # Generate multiple splits for each file
            for _ in range(splits_per_file):
                # Split the code
                split_result = split_code_randomly(
                    code,
                    language=language,
                    min_middle_length=min_middle_length,
                    max_middle_length=max_middle_length
                )

                # Sanity check: the three parts must reassemble to the source
                assert split_result['prefix'] + split_result['middle'] + split_result['suffix'] == code

                # Convert to the prompt text format
                json_line = code_to_text(
                    split_result['prefix'],
                    split_result['middle'],
                    split_result['suffix']
                )

                # Count tokens from the plain id list; no tensor framework is
                # needed just to measure the length.
                token_count = len(tokenizer(json_line["text"])["input_ids"])
                if token_count > max_tokens or token_count < min_tokens:
                    print(f"Skip {file_name}, token count {token_count}")
                    # Stop splitting this file; samples already collected stay.
                    break
                tokens += token_count
                datasets.append(json_line)

        except Exception as e:
            # Best effort: report the file and carry on with the next one
            print(f"Error processing {file_name}: {str(e)}")
            continue

    if datasets:
        print(f"average tokens {tokens / len(datasets):.2f}")
    else:
        # Avoid dividing by zero when nothing was generated
        print("No samples generated")

    # Context manager so the file is flushed and closed before returning
    with open(output_path, 'w', encoding='utf-8') as fout:
        json.dump(datasets, fout, ensure_ascii=False)

In [4]:
# Directory containing the source code files
code_path = "./code"
# Output file: a single JSON array of {"text": ...} samples
output_path = "./code_dataset.json"

# Create dataset with custom parameters
code_to_dataset(
    code_path=code_path,
    output_path=output_path,
    min_middle_length=1,
    max_middle_length=64,
    splits_per_file=5,  # Generate 5 different splits for each file
    max_tokens=32768,  # Max tokens per sample, this should be considered with "cutoff_len" parameter in LLamaFactory
)
print("Dataset generation complete")

# Write the dataset description that maps the "text" field to the prompt column.
# The context manager makes sure the file is flushed and closed.
with open("./dataset_info.json", 'w') as info_file:
    json.dump({
        "code_dataset":{
            "file_name": output_path,
            "columns": {
                "prompt": "text"
            }
        }
    }, info_file, ensure_ascii=False)


0it [00:00, ?it/s][A
3it [00:00, 27.77it/s][A
11it [00:00, 47.23it/s][A
18it [00:00, 52.90it/s][A
24it [00:00, 53.51it/s][A
32it [00:00, 52.32it/s][A
38it [00:00, 36.30it/s][A
43it [00:01, 36.41it/s][A
52it [00:01, 46.35it/s][A
58it [00:01, 38.52it/s][A
65it [00:01, 43.41it/s][A
70it [00:01, 34.93it/s][A
75it [00:01, 34.33it/s][A
83it [00:01, 43.57it/s][A
89it [00:02, 33.53it/s][A
95it [00:02, 38.34it/s][A
101it [00:02, 42.55it/s][AToken indices sequence length is longer than the specified maximum sequence length for this model (29957 > 16384). Running this sequence through the model will result in indexing errors

107it [00:03, 12.97it/s][A
115it [00:03, 18.18it/s][A
122it [00:03, 23.36it/s][A
128it [00:04, 25.43it/s][A
133it [00:04, 26.05it/s][A
141it [00:04, 34.08it/s][A
150it [00:04, 44.00it/s][A
157it [00:04, 42.98it/s][A
165it [00:04, 45.40it/s][A
171it [00:04, 47.36it/s][A
181it [00:05, 58.71it/s][A
188it [00:05, 25.32it/s][A
195it [00:05, 30.69it/s

average tokens 925.52
Dataset generation complete


## Prepare for training

You can start a training job on SageMaker, for example: https://github.com/xqun3/Training_On_SageMaker

or via Modelhub (based on LLamaFactory): https://github.com/aws-samples/llm_model_hub

The dataset info file for LLamaFactory looks like this:

``` json
{
    "code_dataset":{
        "file_name":"code_dataset.json",
        "columns": {
            "prompt": "text"
        }
    }
}

```
