In [None]:
!apt-get update -y
!apt-get install -y build-essential
!pip install -q tree_sitter boto3 smart_open[s3] datasets
!pip install -U fsspec datasets aiohttp

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 http://security.ubuntu.com/ubuntu jammy-security InRelease
Fetched 255 kB in 2s (115 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package list

In [None]:
!git clone https://github.com/bigcode-project/starcoder2-self-align.git
%cd starcoder2-self-align/seed_gathering

fatal: destination path 'starcoder2-self-align' already exists and is not an empty directory.
/content/starcoder2-self-align/seed_gathering


In [None]:
# STEP 1: Uninstall incorrect tree_sitter
!pip uninstall -y tree_sitter

# STEP 2: Install correct official version
!pip install tree-sitter==0.20.1

# STEP 3: Remove old grammar if it exists
!rm -rf tree-sitter-c-sharp

# STEP 4: Clone C# grammar correctly (with submodules!)
!git clone --recurse-submodules https://github.com/tree-sitter/tree-sitter-c-sharp.git

# STEP 5: Build the parser shared object
from tree_sitter import Language, Parser

# Build the C# grammar into a shared library (this creates 'build/cs-lang.so')
Language.build_library(
    'build/cs-lang.so',  # Output path for the compiled library
    ['tree-sitter-c-sharp']  # Path to the cloned C# grammar
)

# Load the C# language from the built library
C_SHARP_LANGUAGE = Language('build/cs-lang.so', 'c_sharp')

# Initialize a new parser for C# language
parser = Parser()
parser.set_language(C_SHARP_LANGUAGE)

Found existing installation: tree_sitter 0.20.1
Uninstalling tree_sitter-0.20.1:
  Successfully uninstalled tree_sitter-0.20.1
Collecting tree-sitter==0.20.1
  Using cached tree_sitter-0.20.1-cp311-cp311-linux_x86_64.whl
Installing collected packages: tree-sitter
Successfully installed tree-sitter-0.20.1
Cloning into 'tree-sitter-c-sharp'...
remote: Enumerating objects: 3966, done.[K
remote: Counting objects: 100% (858/858), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 3966 (delta 785), reused 752 (delta 740), pack-reused 3108 (from 3)[K
Receiving objects: 100% (3966/3966), 122.10 MiB | 12.25 MiB/s, done.
Resolving deltas: 100% (2819/2819), done.


In [None]:
import sys
sys.path.append('/content/starcoder2-self-align/seed_gathering')

### SEED GATHERING GET CONTENT

In [None]:
from tree_sitter_parser import make_parser, get_fns_with_doc_comments, LANGUAGE, node_to_string
import datasets
import os
import signal
from multiprocessing import Pool
#import os
import boto3
import smart_open
#from datasets import load_dataset,Dataset
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
def download_contents(blob_id, src_encoding):
    """Read one gzip-compressed blob from the Software Heritage bucket as text.

    Args:
        blob_id: identifier appended to ``s3://softwareheritage/content/``.
        src_encoding: codec name used to decode the decompressed bytes.

    Returns:
        The decoded blob content as a ``str``.
    """
    blob_url = f"s3://softwareheritage/content/{blob_id}"
    stream = smart_open.open(
        blob_url, "rb", compression=".gz", transport_params={"client": s3}
    )
    with stream as fin:
        raw = fin.read()
    return raw.decode(src_encoding)

In [None]:
def parse_ex(parser, ex):
    """Download one example's source and extract its documented functions.

    Args:
        parser: parser object whose ``parse(bytes)`` returns a syntax tree.
        ex: mapping with ``blob_id`` and ``src_encoding`` keys.

    Returns:
        Whatever ``get_fns_with_doc_comments`` returns for the parsed source, or
        an empty list when the download, decoding, or parsing fails.
    """
    # The download (network access + decoding with the declared encoding) is as
    # failure-prone as parsing; keep it inside the guarded region so a single bad
    # blob yields no functions instead of aborting the caller's whole chunk.
    try:
        source = download_contents(ex["blob_id"], ex["src_encoding"])
    except Exception as e:
        print(f"Error downloading: {e}")
        return []

    try:
        buf = bytes(source, "utf8")
        tree = parser.parse(buf)
        return get_fns_with_doc_comments(buf, tree)
    except Exception as e:
        print(f"Error parsing: {e}")
        return []

PARSERS = None  # Global for multiprocessing

def process_chunk(idx_and_chunk):
    """Parse every example of one sub-chunk with the parser assigned to its index.

    Args:
        idx_and_chunk: ``(index, examples)`` pair; ``index`` selects an entry of
            the module-level ``PARSERS`` list.

    Returns:
        A set with the union of everything ``parse_ex`` returned for the examples.
    """
    assert PARSERS is not None
    worker_idx, examples = idx_and_chunk
    worker_parser = PARSERS[worker_idx]
    per_example = [parse_ex(worker_parser, example) for example in examples]
    return set().union(*per_example)

def main(args):
    """Collect documented C# functions from a dataset using a pool of workers.

    Loads ``args.dataset`` (``data_dir=args.data_dir``, split ``train``), keeps
    rows whose ``language`` field equals ``"C#"`` and hands them, chunk by chunk,
    to ``process_chunk`` running in ``args.num_workers`` processes.

    Args:
        args: object exposing ``dataset``, ``data_dir`` and ``num_workers``.

    Returns:
        A ``datasets.Dataset`` with columns ``content`` (the unique function
        texts) and ``id`` (0..n-1).
    """
    global PARSERS
    ds = datasets.load_dataset(
        args.dataset,
        data_dir=args.data_dir,
        split="train",
    )

    funs = set()
    PARSERS = [make_parser() for _ in range(args.num_workers)]
    total_len = len(ds)
    CHUNK_SIZE = 1000 * args.num_workers
    progress_every = total_len // 100 + 1

    print(f"Total length: {total_len}")
    print(f"Chunk size: {CHUNK_SIZE}")

    def timeout_handler(_, __):
        # Raised from the SIGALRM handler to break out of a stuck next() call.
        raise KeyboardInterrupt

    chunk = []
    p = Pool(args.num_workers)
    for i, ex in enumerate(ds):
        is_last = i == total_len - 1

        if ex["language"] == "C#":  # only parse C# files
            if i % progress_every == 0:
                print(f"{i}/{total_len}")
            chunk.append(ex)

        # Flush on a full chunk, or on the last row of the dataset even when
        # that row itself was filtered out, so a trailing partial chunk is
        # never silently dropped.
        if not chunk or (len(chunk) < CHUNK_SIZE and not is_last):
            continue

        try:
            print(f"Processing chunk {i // CHUNK_SIZE}")
            # Ceiling division: yields at most num_workers non-empty sub-chunks,
            # so every sub-chunk index is a valid PARSERS index and the range
            # step can never be 0.
            subchunk_size = -(-len(chunk) // args.num_workers)
            subchunks = [
                chunk[j:j + subchunk_size]
                for j in range(0, len(chunk), subchunk_size)
            ]

            new_funs_iter = p.imap(process_chunk, list(enumerate(subchunks)))

            print("Getting new functions")
            len_before = len(funs)
            signal.signal(signal.SIGALRM, timeout_handler)
            while True:
                try:
                    # Allow each sub-chunk result 60 seconds to arrive.
                    signal.alarm(60)
                    funs.update(next(new_funs_iter))
                    signal.alarm(0)
                except KeyboardInterrupt:
                    signal.alarm(0)
                    print("Timeout: restarting pool")
                    p.terminate()
                    p = Pool(args.num_workers)
                    break
                except StopIteration:
                    break
                except Exception as e:
                    print(e)

            signal.alarm(0)
            PARSERS = [make_parser() for _ in range(args.num_workers)]
            print(f"Done processing chunk {i // CHUNK_SIZE}. Got {len(funs) - len_before} new functions.")
            chunk = []

        except Exception as e:
            print(e)
            chunk = []

    p.close()

    new_ds_dict = {
        "content": list(funs),
        "id": list(range(len(funs)))
    }

    new_ds = datasets.Dataset.from_dict(new_ds_dict)
    return new_ds

In [None]:
NUMWORKERS = os.cpu_count()

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
ds = datasets.load_dataset(
    "bigcode/the-stack-v2-dedup",
    "C-Sharp",
    streaming=True,
    split="train"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/89.2k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/757 [00:00<?, ?it/s]

In [None]:
from itertools import islice

MAX_FUNS = 4000  # stop after collecting this many functions

funs = set()
PARSERS = [make_parser() for _ in range(NUMWORKERS)]
CHUNK_SIZE = 1000 * NUMWORKERS

print(f"Streaming mode: processing in chunks of {CHUNK_SIZE}")


def _split_for_workers(items, num_workers):
    """Split ``items`` into at most ``num_workers`` contiguous, non-empty lists."""
    if not items:
        return []
    # Ceiling division keeps the number of pieces <= num_workers (each piece
    # index must be a valid PARSERS index) and the step >= 1.
    size = -(-len(items) // num_workers)
    return [items[j:j + size] for j in range(0, len(items), size)]


def _collect(pool, examples, collected, limit):
    """Parse ``examples`` on ``pool`` and add results to ``collected``.

    Returns True as soon as ``collected`` holds at least ``limit`` items.
    """
    subchunks = _split_for_workers(examples, NUMWORKERS)
    for fun_set in pool.imap(process_chunk, list(enumerate(subchunks))):
        collected.update(fun_set)
        if len(collected) >= limit:
            return True
    return False


chunk = []
p = Pool(NUMWORKERS)

stop_processing = False  # set once MAX_FUNS has been reached

for i, ex in enumerate(ds):
    if ex["language"] != "C#":
        continue

    chunk.append(ex)

    if i % 10000 == 0:
        print(f"Seen {i} examples so far...")

    if len(chunk) < CHUNK_SIZE:
        continue

    print(f"Processing chunk ending at #{i}")
    try:
        stop_processing = _collect(p, chunk, funs, MAX_FUNS)
        if stop_processing:
            print(f"Reached {MAX_FUNS} functions. Stopping early.")
    except Exception as e:
        print(f"Error while processing chunk: {e}")
        p.terminate()
        p = Pool(NUMWORKERS)

    PARSERS = [make_parser() for _ in range(NUMWORKERS)]
    print(f"Collected {len(funs)} functions so far.")
    chunk = []

    # Leave immediately instead of pulling one more row from the stream.
    if stop_processing:
        break

# Final flush if not already done and limit not reached
if chunk and not stop_processing:
    print("Final chunk processing")
    stop_processing = _collect(p, chunk, funs, MAX_FUNS)
    if stop_processing:
        print(f"Reached {MAX_FUNS} functions during final chunk. Stopping.")

# On an early stop some sub-chunks may still be running; terminate them rather
# than leaving them behind. Otherwise shut the pool down normally.
if stop_processing:
    p.terminate()
else:
    p.close()
p.join()

# Save results
new_ds_dict = {
    "content": list(funs),
    "id": list(range(len(funs)))
}
new_ds = datasets.Dataset.from_dict(new_ds_dict)

Streaming mode: processing in chunks of 2000
Seen 0 examples so far...
Processing chunk ending at #1999
Collected 1111 functions so far.
Processing chunk ending at #3999
Collected 2433 functions so far.
Processing chunk ending at #5999
Collected 3561 functions so far.
Processing chunk ending at #7999
Reached 4000 functions. Stopping early.
Collected 4072 functions so far.


In [None]:
ds = new_ds

In [None]:
ds

Dataset({
    features: ['content', 'id'],
    num_rows: 4072
})

### SEED GATHERING HIGH-QUALITY SUBSET

In [None]:
import subprocess
import tempfile
import signal
import hashlib
import os
import argparse
from typing import List, Dict
from tqdm import tqdm
from tree_sitter_parser import LANGUAGE, global_parser

# Captures every return statement in a parsed source.
RETURN_QUERY = LANGUAGE.query("""
(return_statement) @return
""")

def does_have_return(src):
    """Tell whether ``src`` contains a ``return`` that carries a value.

    Args:
        src: source text to parse with ``global_parser``.

    Returns:
        True if at least one captured return statement has a named,
        non-comment child (i.e. an expression); False otherwise.
    """
    tree = global_parser.parse(bytes(src, "utf8"))
    for node, _ in RETURN_QUERY.captures(tree.root_node):
        # Counting children is not reliable across grammars: in C# a bare
        # `return;` still has two children (`return` and `;`). Keyword and
        # punctuation tokens are anonymous nodes, so look for a named child
        # that is not a comment instead.
        if any(child.is_named and child.type != "comment" for child in node.children):
            return True
    return False

# Runs mypy in the given directory and tallies, per file name, how many
# "error:" lines appear on mypy's stdout.
def run_mypy(d):
    """Run ``mypy .`` inside ``d`` and count reported errors per file.

    Args:
        d: directory used as the working directory of the mypy process.

    Returns:
        Dict mapping the last path component before the first ``:`` of each
        non-blank stdout line to its number of ``error:`` lines, or None when
        launching or running mypy raised (including the 120 s timeout).
    """
    try:
        completed = subprocess.run(
            ["mypy", "."],
            cwd=d,
            capture_output=True,
            timeout=120,
            text=True,
        )
        outs = completed.stdout
    except Exception as e:
        print(e)
        return None

    filemap = {}
    for line in outs.split("\n"):
        if not line.strip():
            continue
        parts = line.split(":")
        if len(parts) < 2:
            continue
        file = parts[0].split("/")[-1]
        # Every mentioned file gets an entry, even with zero errors.
        filemap.setdefault(file, 0)
        if "error:" in line:
            filemap[file] += 1

    return filemap

def typecheck_batch(files: List[str]) -> Dict[str, str]:
    """Type-check a batch of source strings with mypy and keep the clean ones.

    Each string is written to ``<sha1>.py`` in a temporary directory, mypy is
    run over that directory via ``run_mypy``, and entries whose file has at
    least one reported error are removed.

    Args:
        files: source texts to check.

    Returns:
        Mapping from the SHA-1 hex digest (of the UTF-8 text) to the text for
        every entry that was not reported with errors; ``{}`` if ``run_mypy``
        returned None.
    """
    filemap: Dict[str, str] = {}
    with tempfile.TemporaryDirectory() as tempdir:
        for contents in files:
            hex_dig = hashlib.sha1(bytes(contents, "utf8")).hexdigest()
            filemap[hex_dig] = contents
            name = os.path.join(tempdir, hex_dig + ".py")
            # Write with the same encoding that was hashed; the platform
            # default encoding may be unable to represent non-ASCII sources.
            with open(name, "w", encoding="utf-8") as f:
                f.write(contents)

        # Run mypy in the temporary directory
        typecheck_map = run_mypy(tempdir)
        print(typecheck_map)

        if typecheck_map is None:
            return {}

        for contents, errors in typecheck_map.items():
            no_py = contents.replace(".py", "")
            if errors == 0:
                continue
            if no_py in filemap:
                del filemap[no_py]

        print(f"Pass rate: {len(filemap)}/{len(files)}")
        return filemap

def infer_imports(code: str) -> str:
    """Return ``code`` processed by ``autoimport.fix_code``, with a 10 s limit.

    Args:
        code: source text to process.

    Returns:
        The output of ``autoimport.fix_code``; the unchanged ``code`` if that
        call raises or the SIGALRM-based timeout fires.
    """
    import autoimport

    def handler(signum, frame):
        raise Exception("Timeout")

    try:
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(10)
        fixed = autoimport.fix_code(code)
        signal.alarm(0)
    except Exception as e:
        # Cancel any pending alarm before falling back to the input.
        signal.alarm(0)
        print(f"Error while inferring imports: {e}")
        return code
    return fixed

In [None]:
print("Filtering to only functions with return statements")
ds = ds.filter(lambda ex: does_have_return(
    ex["content"]), num_proc=os.cpu_count())




Filtering to only functions with return statements


Filter (num_proc=2):   0%|          | 0/4072 [00:00<?, ? examples/s]

In [None]:
ds

Dataset({
    features: ['content', 'id'],
    num_rows: 1942
})

In [None]:
import pandas as pd

df = ds.to_pandas()
df.head()

Unnamed: 0,content,id
0,///<summary>\n/// Returns a deep copy of the c...,0
1,/// <summary>\n/// 初始化任务集合\n/// </summary>\n//...,2
2,/// <summary>\n/// Tries to convert the string...,3
3,/// <summary>\n/// Returns the concrete implem...,4
4,/// <summary>\n/// Gets the positive modulo of...,5


In [None]:
from tqdm import tqdm
import hashlib

# Deduplicate functions by the SHA-1 of their UTF-8 text, keeping the first
# occurrence of each and assigning consecutive ids.
new_ds = {
    "content": [],
    "sha1": [],
    "id": [],
}
seen_sha1 = set()  # set membership is O(1); scanning new_ds["sha1"] is O(n) per row

e_id = 0
is_streaming = hasattr(ds, "is_streaming") and ds.is_streaming
# tqdm's total is the number of rows (not the last index); unknown when streaming.
total = None if is_streaming else len(ds)

for i, ex in enumerate(tqdm(ds, total=total)):
    try:
        # No need to filter by language again; that was done in the earlier step
        code = ex["content"]
        sha1 = hashlib.sha1(code.encode("utf-8")).hexdigest()
    except Exception as e:
        print(f"There was an error at index {i}: {e}")
        continue

    # Rows are handled one at a time, so nothing is left in a pending batch
    # when the input ends (which matters for streaming inputs of unknown length).
    if sha1 in seen_sha1:
        continue
    seen_sha1.add(sha1)
    new_ds["content"].append(code)
    new_ds["sha1"].append(sha1)
    new_ds["id"].append(e_id)
    e_id += 1

# Convert to Hugging Face Dataset
new_ds_hf = datasets.Dataset.from_dict(new_ds)

1942it [00:00, 9463.68it/s]                           


In [None]:
new_ds_hf

Dataset({
    features: ['content', 'sha1', 'id'],
    num_rows: 1942
})

In [None]:
import pandas as pd

df1 = new_ds_hf.to_pandas()
df1.head()

Unnamed: 0,content,sha1,id
0,///<summary>\n/// Returns a deep copy of the c...,48ffac77b1a4a6444e1235ee084392bb2be244ce,0
1,/// <summary>\n/// 初始化任务集合\n/// </summary>\n//...,7b7e490cab179bc3ddacc8c7f4b0fb31cbaf40d9,1
2,/// <summary>\n/// Tries to convert the string...,d243747f1453623992ca57b3722d1f2a6d2ab915,2
3,/// <summary>\n/// Returns the concrete implem...,0e50fccc48733361aeb4e3eada840d90765c05ff,3
4,/// <summary>\n/// Gets the positive modulo of...,25edf92bb7c479536bf8e36a088d730c252232dd,4


In [None]:
import hashlib
from tqdm import tqdm

# Deduplicate functions by the SHA-1 of their UTF-8 text, keeping the first
# occurrence of each, and stop once MAX_FUNS unique functions are collected.
new_ds = {
    "content": [],
    "sha1": [],
    "id": [],
}
seen_sha1 = set()  # set membership is O(1); scanning new_ds["sha1"] is O(n) per row

e_id = 0
MAX_FUNS = 10000  # Stop early if desired

print("Starting hashing deduplication...")

for i, ex in enumerate(tqdm(ds)):
    try:
        code = ex["content"]
        sha1 = hashlib.sha1(code.encode("utf-8")).hexdigest()
    except Exception as e:
        print(f"Error at index {i}: {e}")
        continue

    if sha1 not in seen_sha1:
        seen_sha1.add(sha1)
        new_ds["content"].append(code)
        new_ds["sha1"].append(sha1)
        new_ds["id"].append(e_id)
        e_id += 1

    if len(new_ds["content"]) >= MAX_FUNS:
        print(f"✅ Reached {MAX_FUNS} functions. Stopping.")
        break

# --- CONVERT TO HF DATASET OBJECT ---
new_ds_hf = datasets.Dataset.from_dict(new_ds)
print(f"✅ Final dataset has {len(new_ds_hf)} functions.")

Starting hashing deduplication...


100%|██████████| 1942/1942 [00:00<00:00, 5950.51it/s]

✅ Final dataset has 1942 functions.





In [None]:
new_ds_hf

Dataset({
    features: ['content', 'sha1', 'id'],
    num_rows: 1942
})

In [None]:
save_dir = "../datasets/seed2"

In [None]:
new_ds_hf.save_to_disk(save_dir)

Saving the dataset (0/1 shards):   0%|          | 0/1942 [00:00<?, ? examples/s]

In [None]:
import pandas as pd

df2 = new_ds_hf.to_pandas()
df2.head()

Unnamed: 0,content,sha1,id
0,///<summary>\n/// Returns a deep copy of the c...,48ffac77b1a4a6444e1235ee084392bb2be244ce,0
1,/// <summary>\n/// 初始化任务集合\n/// </summary>\n//...,7b7e490cab179bc3ddacc8c7f4b0fb31cbaf40d9,1
2,/// <summary>\n/// Tries to convert the string...,d243747f1453623992ca57b3722d1f2a6d2ab915,2
3,/// <summary>\n/// Returns the concrete implem...,0e50fccc48733361aeb4e3eada840d90765c05ff,3
4,/// <summary>\n/// Gets the positive modulo of...,25edf92bb7c479536bf8e36a088d730c252232dd,4


In [None]:
df2.to_csv('dataset.csv', index=False)

from google.colab import files
files.download('dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp -r /content/starcoder2-self-align/datasets /content/drive/MyDrive/

### SEED GATHERING FILTER DATASET

In [None]:
!pip install vllm



In [None]:
!unzip /content/datasets-20250514T224535Z-1-001.zip -d /content/

Archive:  /content/datasets-20250514T224535Z-1-001.zip
  inflating: /content/datasets/seed2/dataset_info.json  
  inflating: /content/datasets/seed2/state.json  
  inflating: /content/datasets/seed2/data-00000-of-00001.arrow  


In [None]:
from datasets import load_from_disk

new_ds_hf = load_from_disk("/content/datasets/seed2")

In [None]:
new_ds_hf
new_ds_hf[0]  # View the first entry

{'content': '///<summary>\n/// Returns a deep copy of the child collection object passed in.\n///</summary>\npublic static object MakeCopyOf(object x, IDictionary existingCopies)\n\t\t{\n\t\t\tif (x == null)\n\t\t\t\treturn null;\n\t\t\t\n\t\t\tif (x is ICloneableEx)\n\t\t\t{\n\t\t\t\t// Return a deep copy of the object\n\t\t\t\treturn ((ICloneableEx)x).Clone(existingCopies);\n\t\t\t}\n\t\t\telse if (x is ICloneable)\n\t\t\t{\n\t\t\t\t// Return a deep copy of the object\n\t\t\t\treturn ((ICloneable)x).Clone();\n\t\t\t}\n\t\t\telse\n\t\t\t\tthrow new System.NotSupportedException("Object Does Not Implement the ICloneable or IClonableEx Interface.");\n\t\t}',
 'sha1': '48ffac77b1a4a6444e1235ee084392bb2be244ce',
 'id': 0}

In [None]:
import datasets
import os
from tree_sitter_parser import global_parser, LANGUAGE, does_have_return, make_parser
import benchmark_data
from tqdm import tqdm
import torch
import argparse
from vllm import LLM, SamplingParams
import random

In [None]:
FN_BLOCK_QUERY = LANGUAGE.query("""
(local_function_statement
  body: (block) @fn-block)

(method_declaration
  body: (block) @fn-block)
""")


def template_few_shot(code, answer, rationale):
    """Render one solved few-shot example in the issue/comment prompt format.

    Args:
        code: C# method text including its ``///`` doc comment.
        answer: the expected verdict, ``"Yes"`` or ``"No"``.
        rationale: explanation text placed after the verdict.

    Returns:
        The formatted example as a string.
    """
    description, method_src = cs_extract_docstring(code)  # split doc comment from code
    assert answer in ("Yes", "No")
    return f"""<issue_start>username_0: I have a method in C# and I'd like someone to check my description of this method.
I'm doing this so that I can write a good XML-style doc comment for this method.

Here is the code for the method:
```csharp
{method_src}
```

Here is my description of this program:
```
{description}
```

Do not attempt to execute the method or to judge its correctness.
Answer with "Yes" or "No" depending on if my description has enough information alone to re-implement the method.
Also, answer with "No" if the description does not match the method.<issue_comment>username_1: Sure, no problem. I will be able to help.
My answer is: {answer}

{rationale}

Upvotes: 200"""


FEW_SHOTS = [
    (
        '''/// <summary>Converts inches to meters.</summary>
public double InchesToMeters(double inches)
{
    return inches * 0.0254;
}''',
        "Yes",
        "The method is simple and its summary matches the implementation exactly: it converts inches to meters using a fixed multiplier."
    ),
    (
        '''/// <summary>Sums all integers in a list.</summary>
public int Sum(List<int> numbers)
{
    int total = 0;
    foreach (int num in numbers)
    {
        total += num;
    }
    return total;
}''',
        "Yes",
        "The summary clearly explains the purpose of summing all numbers in the list, and it matches the method perfectly."
    ),
    (
        '''/// <summary>Initializes the network settings.</summary>
public List<string> InitNetwork()
{
    List<string> addresses = new List<string>();
    for (int i = 1; i <= 254; i++)
    {
        addresses.Add("192.168.1." + i.ToString());
    }
    return addresses;
}''',
        "No",
        "The summary is too vague. It doesn’t mention the specific IP range being generated or that it's hardcoded to 192.168.1.x."
    ),
    (
        '''/// <summary>Multiplies all values in an array.</summary>
public int Product(int[] values)
{
    int result = 1;
    foreach (int val in values)
    {
        result *= val;
    }
    return result;
}''',
        "Yes",
        "The summary directly describes what the method does — multiplying all values in an array — and matches the logic."
    ),
    (
        '''/// <summary>Converts Fahrenheit to Celsius.</summary>
public double ToCelsius(double fahrenheit)
{
    return (fahrenheit - 32) * 5.0 / 9.0;
}''',
        "Yes",
        "This is a textbook temperature conversion formula, and the summary matches the implementation precisely."
    ),
    (
        '''/// <summary>Finds the max value in a list.</summary>
public int FindMin(List<int> list)
{
    return list.Min();
}''',
        "No",
        "The summary says 'finds the max value' but the method actually finds the **minimum** value using `list.Min()`."
    ),
    (
        '''/// <summary>Returns true if number is even.</summary>
public bool IsOdd(int number)
{
    return number % 2 != 0;
}''',
        "No",
        "The summary incorrectly claims the method checks for evenness, but the actual code checks for **odd** numbers."
    )
]

def prompt_fmt(code):
    """Build the full judging prompt for one C# method.

    The prompt is every entry of ``FEW_SHOTS`` rendered by
    ``template_few_shot`` in random order, followed by the unanswered question
    for ``code``.

    Args:
        code: C# method text including its ``///`` doc comment.

    Returns:
        The prompt string, ending right after ``My answer is:``.
    """
    doc, code = cs_extract_docstring(code)
    # Shuffle a copy: shuffling FEW_SHOTS itself would reorder the shared
    # module-level list as a side effect of every call.
    shots = list(FEW_SHOTS)
    random.shuffle(shots)
    buf = ""
    for few in shots:
        buf += template_few_shot(*few)
    buf += f"""<issue_start>username_0: I have a method in C# and I'd like someone to check my description of this method.
I'm doing this so that I can write a good XML-style doc comment for this method.

Here is the code for the method:
```csharp
{code}
```

Here is my description of this program:
```
{doc}
```

Do not attempt to execute the method or to judge its correctness.
Answer with "Yes" or "No" depending on if my description has enough information alone to re-implement the method.
Also, answer with "No" if the description does not match the method.
Upvotes: 100<issue_comment>username_1: Sure, no problem. I will be able to help.
My answer is:"""
    return buf


def auto_dtype():
    """Pick the dtype string to use for model loading.

    Returns:
        ``"bfloat16"`` when CUDA is available and reports bf16 support,
        otherwise ``"auto"``.
    """
    # Check availability first: querying bf16 support touches the CUDA device
    # and can raise on machines without one.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return "bfloat16"
    return "auto"


def chunkify(lst, n):
    """Split ``lst`` into consecutive lists of at most ``n`` items each.

    Args:
        lst: indexable sequence with a length.
        n: maximum number of items per chunk.

    Returns:
        A list of lists; the last one may be shorter than ``n``.
    """
    size = len(lst)
    return [
        [lst[pos] for pos in range(start, min(start + n, size))]
        for start in range(0, size, n)
    ]


In [None]:
dataset = new_ds_hf

In [None]:
print("🔎 Checking first method sample for block match...\n")

sample_code = dataset[0]["content"]
print("Code snippet:")
print("\n".join(sample_code.strip().splitlines()[:10]))
print("...\n")

# Parse the UTF-8 bytes and keep them around: node offsets are byte offsets
# into this buffer, not character offsets into the str.
sample_bytes = sample_code.encode("utf-8")
tree = global_parser.parse(sample_bytes)
root = tree.root_node
print("Tree-sitter S-expression:")
print(root.sexp()[:1000])  # truncate long output

captures = FN_BLOCK_QUERY.captures(root)
print(f"\n🧩 Captures found by FN_BLOCK_QUERY: {len(captures)}")
if captures:
    block, _ = captures[0]
    print("\n✅ Matched block content:")
    # Slice the byte buffer, then decode, so non-ASCII sources are cut at the
    # right positions.
    print(sample_bytes[block.start_byte:block.end_byte].decode("utf-8"))
else:
    print("❌ No method block matched. Your Tree-sitter query might not align with the parsed structure.")

print("\n🔍 First-level child node types:")
for child in root.children[:15]:
    print(f"- {child.type}")

🔎 Checking first method sample for block match...

Code snippet:
///<summary>
/// Returns a deep copy of the child collection object passed in.
///</summary>
public static object MakeCopyOf(object x, IDictionary existingCopies)
		{
			if (x == null)
				return null;
			
			if (x is ICloneableEx)
			{
...

Tree-sitter S-expression:
(compilation_unit (comment) (comment) (comment) (global_statement (local_function_statement (modifier) (modifier) type: (predefined_type) name: (identifier) parameters: (parameter_list (parameter type: (predefined_type) name: (identifier)) (parameter type: (identifier) name: (identifier))) body: (block (if_statement condition: (binary_expression left: (identifier) right: (null_literal)) consequence: (return_statement (null_literal))) (if_statement condition: (is_pattern_expression expression: (identifier) pattern: (constant_pattern (identifier))) consequence: (block (comment) (return_statement (invocation_expression function: (member_access_expression express

In [None]:
print(new_ds_hf)
print(f"Length: {len(new_ds_hf)}")

Dataset({
    features: ['content', 'sha1', 'id'],
    num_rows: 1942
})
Length: 1942


In [None]:
print(f"Loaded {len(dataset)} examples. Running pre-filtering...")

# Filter out C#-irrelevant or low-quality functions
BAD_WORDS = ["todo", "fixme", "bug", "hack"]
BAD_USINGS = ["System.Diagnostics", "System.Runtime.InteropServices"]
BAD_METHODS = ["Process.Start", "DllImport", "Console.ReadLine"]

# Patterns we don't want in seed functions
BAD_SUBSTRINGS = BAD_WORDS + BAD_USINGS + BAD_METHODS

# If benchmark filtering is used — this assumes benchmark_data is a dictionary
bench_filter = benchmark_data.filter_out()
all_bench = bench_filter.get("human_eval_docstrings", []) + \
            bench_filter.get("human_eval_solutions", []) + \
            bench_filter.get("mbpp_docstrings", []) + \
            bench_filter.get("mbpp_solutions", [])

Loaded 1942 examples. Running pre-filtering...
num strings from mbpp_docstrings: 120
num strings from mbpp_solutions: 120
num strings from human_eval_docstrings: 164
num strings from human_eval_solutions: 161


In [None]:
dataset = new_ds_hf

In [None]:
from collections import Counter
import os

REJECTED = []

def cs_extract_docstring(code: str):
    """
    Splits the `///` doc-comment lines from the rest of a C# method.

    Every line whose stripped form starts with `///` contributes to the
    docstring (with that leading marker and surrounding whitespace removed);
    all other lines are kept, unchanged, as the code.

    Returns (docstring, code), both stripped of leading/trailing whitespace.
    """
    doc_lines = []
    body_lines = []
    for line in code.strip().splitlines():
        stripped = line.strip()
        if stripped.startswith("///"):
            # Drop only the leading marker; a later "///" inside the text
            # (e.g. in a file:/// URL) is part of the description.
            doc_lines.append(stripped[3:].strip())
        else:
            body_lines.append(line)
    return "\n".join(doc_lines).strip(), "\n".join(body_lines).strip()

def pre_filtering(ex):
    """Decide whether one example passes the cheap pre-filters.

    Checks, in order: banned substrings (case-insensitive), benchmark overlap,
    length (more than 150 lines), presence of a return per ``does_have_return``,
    a ``///`` docstring of at least three words, and at least one capture of
    ``FN_BLOCK_QUERY``. Each rejection appends ``(reason, code)`` to the
    module-level ``REJECTED`` list; an exception appends ``("exception", str(e))``.

    Args:
        ex: mapping with a ``content`` key holding the function text.

    Returns:
        True if the example passes every check, False otherwise.
    """
    def reject(reason, payload):
        REJECTED.append((reason, payload))
        return False

    try:
        code = ex["content"]
        code_bytes = code.encode("utf-8")

        # Filter 1: Banned substrings
        lowered = code.lower()
        if any(word.lower() in lowered for word in BAD_SUBSTRINGS):
            return reject("bad_substring", code)

        # Filter 2: Benchmark contamination
        if any(b in code for b in all_bench):
            return reject("benchmark_overlap", code)

        # Filter 3: Too long
        if len(code.splitlines()) > 150:
            return reject("too_long", code)

        # Filter 4: Must have a return statement
        if not does_have_return(code):
            return reject("no_return", code)

        # Filter 5: Must include a docstring (`///`)
        doc, _ = cs_extract_docstring(code)
        if not doc or len(doc.split()) < 3:
            return reject("no_docstring", code)

        # Filter 6: Must have valid method block parsed
        tree = global_parser.parse(code_bytes)
        if not FN_BLOCK_QUERY.captures(tree.root_node):
            return reject("no_method_block", code)

        return True

    except Exception as e:
        return reject("exception", str(e))

# Run the filter
print(f"Loaded {len(dataset)} examples. Running pre-filtering...")

# pre_filtering records its reasons in the in-memory REJECTED list. Run the
# filter in this process (no num_proc) so those appends are visible here;
# with worker processes each worker fills its own copy and the breakdown below
# stays empty. Caching is disabled so the function really runs on every row.
REJECTED.clear()
dataset = dataset.filter(pre_filtering, load_from_cache_file=False)

# Show rejection summary
print("\n🔍 Rejected samples breakdown:")
reason_counts = Counter(reason for reason, _ in REJECTED)
first_example = {}
for reason, payload in REJECTED:
    first_example.setdefault(reason, payload)  # keep the first sample per reason

for reason, count in reason_counts.items():
    print(f"- {reason}: {count}")
    print("  Example:")
    print("  " + "\n  ".join(first_example[reason].strip().splitlines()[:5]))
    print("  ...\n")

print(f"\n✅ Final filtered dataset size: {len(dataset)}")



Loaded 1942 examples. Running pre-filtering...


Filter (num_proc=11):   0%|          | 0/1942 [00:00<?, ? examples/s]


🔍 Rejected samples breakdown:

✅ Final filtered dataset size: 1708


In [None]:
dataset

Dataset({
    features: ['content', 'sha1', 'id'],
    num_rows: 1708
})

In [None]:
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # ⬅️ Set before importing vllm

from vllm import LLM, SamplingParams
import torch

def auto_dtype():
    """Return "bfloat16" when CUDA is available and supports bf16, else "float16"."""
    # Check availability first: querying bf16 support touches the CUDA device
    # and can raise on machines without one.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return "bfloat16"
    return "float16"

model = LLM(
    model="bigcode/starcoder2-15b",
    dtype=auto_dtype(),
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1,
)

INFO 05-15 00:39:25 [config.py:2968] Downcasting torch.float32 to torch.bfloat16.
INFO 05-15 00:39:25 [config.py:717] This model supports multiple tasks: {'classify', 'reward', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 05-15 00:39:25 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.


KeyboardInterrupt: 

In [None]:
tokenizer = model.get_tokenizer()

In [None]:
print(f"Now running stage 3 filtering on {len(dataset)} examples...")

Now running stage 3 filtering on 1708 examples...


In [None]:
def unindent(s):
    """Remove the common leading indentation from the lines of ``s``.

    The amount removed is the smallest number of leading whitespace characters
    among non-blank lines (0 if there are none). Lines shorter than that amount
    are left unchanged.

    Args:
        s: text to unindent.

    Returns:
        The lines of ``s`` joined with ``"\\n"`` after removing the indentation.
    """
    lines = s.splitlines()
    indents = [len(text) - len(text.lstrip()) for text in lines if text.strip()]
    min_indent = min(indents) if indents else 0

    result = []
    for text in lines:
        if len(text) >= min_indent:
            result.append(text[min_indent:])
        else:
            result.append(text)
    return '\n'.join(result)


def py_extract_docstring(code):
    """Split the first triple-double-quoted docstring out of Python source.

    Args:
        code: source text containing an opening and a closing ``\"\"\"``.

    Returns:
        ``(doc, code)``: the unindented, stripped docstring text, and the source
        with the docstring and both delimiters removed.

    Raises:
        AssertionError: if the opening or closing delimiter is not found.
    """
    quote = '"""'
    open_at = code.find(quote)
    assert open_at != -1
    text_start = open_at + 3
    # The search for the closing delimiter starts one character past the
    # beginning of the docstring text.
    close_at = code.find(quote, text_start + 1)
    assert close_at != -1
    doc = unindent(code[text_start:close_at]).strip()
    remainder = code[:open_at] + code[close_at + 3:]
    return doc, remainder



In [None]:
dummy = 'def dummy(): \n    """\n    """\n pass'
dummy_prompt = prompt_fmt(dummy)
few_shot_toks = len(tokenizer.encode(
    dummy_prompt)) - len(tokenizer.encode(dummy))
print(f"Few-shot prompt has {few_shot_toks} tokens")

Few-shot prompt has 1878 tokens


In [None]:
# ✅ Generate prompts
prompts = []
skipped = set()  # dataset row indices whose prompt would exceed the context limit
for idx, ex in enumerate(tqdm(dataset, total=len(dataset), desc="Generating prompts")):
    code = ex["content"]
    toks = len(tokenizer.encode(code)) + few_shot_toks
    if toks > 16380:  # Starcoder2 models have 16384 context limit
        print(f"Skipping example with {toks} tokens")
        # Placeholder keeps prompts[i] aligned with dataset row i.
        prompts.append(dummy_prompt)
        skipped.add(idx)
        continue
    prompts.append(prompt_fmt(code))

# ✅ Generate answers using StarCoder2-15B
responses = []

# Number of prompts handed to the model per generate() call.
batch_size = 2

sampling_params = SamplingParams(
    temperature=0.0,
    stop=["\n"],
    max_tokens=3  # only the leading Yes/No is needed
)

for chunk in tqdm(chunkify(prompts, batch_size), desc="Generating responses"):
    outs = model.generate(chunk, sampling_params)
    for out in outs:
        text = out.outputs[0].text.lower()
        responses.append(text.count("yes") > text.count("no"))

# A skipped row was judged on the placeholder prompt, not on its own code, so
# that answer says nothing about the row: always reject it.
for idx in skipped:
    responses[idx] = False

Generating prompts: 100%|██████████| 1708/1708 [00:01<00:00, 1327.76it/s]
Generating responses:   0%|          | 0/854 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Generating responses:   0%|          | 0/854 [12:13<?, ?it/s]


KeyboardInterrupt: 

In [None]:
dataset

In [None]:
# Cap at 75000 rows, but never ask for more rows than the dataset has
# (select() raises on out-of-range indices).
subset = dataset.select(range(min(75000, len(dataset))))

In [None]:
subset

In [None]:
# Keep row i only when responses[i] is truthy and the row's content does not
# contain the text "def dummy()".
# NOTE(review): over-length rows had their *prompt* replaced by the dummy
# prompt, but this test looks at the row's content, so it presumably does not
# catch those rows — confirm.
# NOTE(review): responses[i] assumes responses has one entry per row of subset,
# in the same order — verify after an interrupted generation run.
new_ds = subset.filter(  # horrible hack!
    lambda ex, i: responses[i] and "def dummy()" not in ex["content"], with_indices=True)
print(f"Filtered {len(dataset) - len(new_ds)} examples")

In [None]:
new_ds.save_to_disk("../datasets/seed3")

In [None]:
new_ds