In [1]:
from datasets import load_from_disk

data = load_from_disk("data")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data

Dataset({
    features: ['content', 'sha1', 'id'],
    num_rows: 17581
})

### SEED GATHERING GET CONTENT

In [None]:
from huggingface_hub import login

In [2]:
import tree_sitter_javascript as tsjs
from tree_sitter import Language, Parser

LANGUAGE = Language(tsjs.language())

import datasets
import os
import signal
from multiprocessing import Pool
import boto3
import smart_open
from botocore import UNSIGNED
from botocore.config import Config

In [3]:
# Set up S3 client
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

def download_contents(blob_id, src_encoding):
    """Fetch one blob from the Software Heritage S3 bucket and decode it.

    Args:
        blob_id: Key of the blob under ``s3://softwareheritage/content/``.
        src_encoding: Text encoding used to decode the downloaded bytes.

    Returns:
        The blob's contents as a ``str``.
    """
    s3_url = f"s3://softwareheritage/content/{blob_id}"
    # Blobs are opened with ".gz" compression and read through the module-level
    # unsigned S3 client.
    open_kwargs = {"compression": ".gz", "transport_params": {"client": s3}}
    with smart_open.open(s3_url, "rb", **open_kwargs) as fin:
        raw = fin.read()
    return raw.decode(src_encoding)

In [4]:
# JavaScript-specific Tree-sitter query for top-level functions with docstrings
TOPLEVEL_DOCSTRING_QUERY = LANGUAGE.query("""
(
  (function_declaration
    name: (identifier)
    body: (statement_block . 
      (comment) @docstring)) @function.def
)
""")

def node_to_string(buf, node):
    """Return the source text spanned by ``node``.

    Args:
        buf: The ``bytes`` buffer that was parsed.
        node: An object exposing ``start_byte`` and ``end_byte`` offsets into ``buf``.

    Returns:
        The UTF-8 decoded slice; undecodable bytes are dropped.
    """
    start, end = node.start_byte, node.end_byte
    segment = buf[start:end]
    return segment.decode("utf8", errors="ignore")

def get_fns_with_docstrings(src, tree):
    """Collect the source text of column-0 functions matched by the docstring query.

    Args:
        src: The ``bytes`` buffer that ``tree`` was parsed from.
        tree: Parsed tree; only ``tree.root_node`` is used.

    Returns:
        A list of strings, one per ``function.def`` capture that starts at
        column 0. Captures under any other name are ignored.
    """
    return [
        node_to_string(src, node)
        for node, capture_name in TOPLEVEL_DOCSTRING_QUERY.captures(tree.root_node)
        # start_point is (row, column); column 0 means the function is not nested.
        if capture_name == "function.def" and node.start_point[1] == 0
    ]


def parse_ex(parser, ex):
    """Download one dataset row's blob, parse it and extract its documented functions.

    Args:
        parser: Object with a ``parse(bytes)`` method returning a tree.
        ex: Mapping with at least ``"blob_id"`` and ``"src_encoding"`` keys.

    Returns:
        The list produced by ``get_fns_with_docstrings``; an empty list if the
        download, decode or parse step raises.
    """
    # Keep the id in its own variable: the row must not be overwritten by the
    # downloaded source, otherwise the error message below cannot look it up.
    blob_id = None
    try:
        blob_id = ex["blob_id"]
        source = download_contents(blob_id, ex["src_encoding"])
        buf = bytes(source, "utf8")
        tree = parser.parse(buf)
        return get_fns_with_docstrings(buf, tree)
    except Exception as e:
        print(f"Error parsing blob {blob_id}: {e}")
        return []

PARSERS = None

def process_chunk(idx_and_chunk):
    """Extract documented functions from every row of one sub-chunk.

    Args:
        idx_and_chunk: ``(idx, chunk)`` where ``idx`` selects a parser from the
            module-level ``PARSERS`` list and ``chunk`` is an iterable of rows.

    Returns:
        A set with the function sources found in the sub-chunk.
    """
    idx, chunk = idx_and_chunk
    # PARSERS is only filled in by main_js(). A worker that did not inherit that
    # state (or an index past the end of the list) gets a fresh parser instead
    # of failing on PARSERS[idx].
    if PARSERS is not None and idx < len(PARSERS):
        parser = PARSERS[idx]
    else:
        parser = make_parser()
    chunk_new_funs = set()
    for ex in chunk:
        chunk_new_funs.update(parse_ex(parser, ex))
    return chunk_new_funs

def make_parser():
    """Build a new tree-sitter ``Parser`` bound to the module-level ``LANGUAGE``."""
    return Parser(LANGUAGE)

def main_js():
    """Stream the JavaScript split of the-stack-v2-dedup and collect documented functions.

    Rows are buffered into chunks of ``1000 * NUM_WORKERS``; each chunk is split
    into one sub-chunk per worker and handed to ``process_chunk`` through a
    ``multiprocessing.Pool``. Waiting for a sub-chunk result is limited to 60
    seconds by ``SIGALRM``; on timeout (or Ctrl-C) the pool is torn down and
    rebuilt and the rest of that chunk is abandoned.

    Returns:
        A set with the source text of every function that was extracted.

    Note:
        The cell output recorded below this function shows spawn-started workers
        failing with ``Can't get attribute 'process_chunk' on <module '__main__'>``:
        under the spawn start method the worker cannot unpickle a function that
        only exists in the notebook. That cannot be fixed from inside this
        function; ``process_chunk`` has to live in an importable module.
    """
    global PARSERS
    ds = datasets.load_dataset(
        "bigcode/the-stack-v2-dedup",
        "JavaScript",
        streaming=True,
        split="train",
        cache_dir="./stack_cache"
    )

    funs = set()
    NUM_WORKERS = os.cpu_count() or 1  # cpu_count() may return None
    PARSERS = [make_parser() for _ in range(NUM_WORKERS)]
    CHUNK_SIZE = 1000 * NUM_WORKERS

    pool = Pool(NUM_WORKERS)

    def timeout_handler(_, __):
        # Reuse KeyboardInterrupt so a timeout and a manual interrupt share one path.
        raise KeyboardInterrupt

    def process_buffered(chunk, chunk_no):
        """Fan one chunk out to the pool and merge the results into ``funs``."""
        nonlocal pool
        global PARSERS

        print(f"Processing chunk {chunk_no}")
        # Ceiling division: never more sub-chunks than workers (and never a zero
        # step), even for the short final chunk.
        subchunk_size = -(-len(chunk) // NUM_WORKERS)
        subchunks = [chunk[j:j + subchunk_size] for j in range(0, len(chunk), subchunk_size)]
        new_funs_iter = pool.imap(process_chunk, list(enumerate(subchunks)))

        print("Getting new functions")
        len_before = len(funs)
        signal.signal(signal.SIGALRM, timeout_handler)

        while True:
            try:
                signal.alarm(60)
                funs.update(next(new_funs_iter))
            except KeyboardInterrupt:
                signal.alarm(0)
                print("Timeout or keyboard interrupt. Restarting pool.")
                pool.terminate()
                pool.join()
                pool = Pool(NUM_WORKERS)
                break
            except StopIteration:
                break
            except Exception as e:
                print(f"Error during pool iteration: {e}")
            finally:
                # Always disarm: an alarm left pending after StopIteration would
                # raise KeyboardInterrupt later, in the middle of streaming.
                signal.alarm(0)

        PARSERS = [make_parser() for _ in range(NUM_WORKERS)]
        print(f"✅ Done processing chunk {chunk_no}. Got {len(funs) - len_before} new functions.")

    chunk = []
    chunk_no = 0
    i = 0
    try:
        for ex in ds:
            try:
                chunk.append(ex)
                if len(chunk) == CHUNK_SIZE:
                    process_buffered(chunk, chunk_no)
                    chunk_no += 1
                    chunk = []

                i += 1
                if i % 1000 == 0:
                    print(f"Progress: {i} examples processed.")

            except Exception as e:
                print(f"Error on example {i}: {e}")
                chunk = []

            # Optional: early stop if needed
            # if i > 10000:
            #     break

        # The stream rarely ends on a chunk boundary; process what is left.
        if chunk:
            try:
                process_buffered(chunk, chunk_no)
            except Exception as e:
                print(f"Error on example {i}: {e}")
    except BaseException:
        # Do not leave worker processes behind when the run is aborted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    print(f"✅ Total JavaScript functions extracted: {len(funs)}")
    return funs

In [None]:
all_functions = main_js()

Resolving data files:   0%|          | 0/757 [00:00<?, ?it/s]

Progress: 1000 examples processed.
Progress: 2000 examples processed.
Progress: 3000 examples processed.
Progress: 4000 examples processed.
Progress: 5000 examples processed.
Progress: 6000 examples processed.
Progress: 7000 examples processed.
Processing chunk 0
Getting new functions


Process SpawnPoolWorker-1:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_chunk' on <module '__main__' (built-in)>
Process SpawnPoolWorker-2:
Traceback (most recent call last):
  Fil

Timeout or keyboard interrupt. Restarting pool.


Process SpawnPoolWorker-13:
Process SpawnPoolWorker-9:
Process SpawnPoolWorker-15:
Process SpawnPoolWorker-10:
Process SpawnPoolWorker-14:
Process SpawnPoolWorker-16:
Process SpawnPoolWorker-12:
Process SpawnPoolWorker-11:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/poo

✅ Done processing chunk 0. Got 0 new functions.
Progress: 8000 examples processed.
Progress: 9000 examples processed.
Progress: 10000 examples processed.
Progress: 11000 examples processed.
Progress: 12000 examples processed.
Progress: 13000 examples processed.
Progress: 14000 examples processed.
Progress: 15000 examples processed.
Processing chunk 1
Getting new functions


Process SpawnPoolWorker-18:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_chunk' on <module '__main__' (built-in)>
Process SpawnPoolWorker-17:
Traceback (most recent call last):
  F

In [None]:
# `all_functions` is a set, whose iteration order is not stable between runs.
# Sort it once so that row order, and therefore every "id", is reproducible.
unique_functions = sorted(all_functions)
new_ds = datasets.Dataset.from_dict({
    "content": unique_functions,
    "id": list(range(len(unique_functions)))
})

In [None]:
ds = new_ds
ds

### SEED GATHERING HIGH-QUALITY SUBSET

In [None]:
import subprocess
import tempfile
import signal
import hashlib
import os
import argparse
from typing import List, Dict
from tqdm import tqdm
from tree_sitter_parser import LANGUAGE, global_parser

RETURN_QUERY = LANGUAGE.query("""
(return_statement) @return
""")

def does_have_return(src):
    """Tell whether ``src`` contains at least one ``return`` that carries a value.

    Args:
        src: Source code as a ``str``; it is parsed with ``global_parser``.

    Returns:
        ``True`` if any node captured by ``RETURN_QUERY`` has a named child,
        ``False`` otherwise.
    """
    tree = global_parser.parse(bytes(src, "utf8"))
    captures = RETURN_QUERY.captures(tree.root_node)
    for node, _ in captures:
        # Count named children only. The `return` keyword and a trailing `;`
        # are anonymous tokens, so a bare `return;` (two children) must not be
        # mistaken for a return with a value.
        if node.named_child_count > 0:
            return True
    return False

# Runs `mypy .` inside directory `d` and tallies, per reported file name,
# how many output lines contain "error:".
def run_mypy(d):
    """Type-check a directory with mypy and count errors per file.

    Args:
        d: Directory used as the working directory for ``mypy .``.

    Returns:
        A dict mapping the last path component of each ``name:...`` output line
        to its number of ``error:`` lines (0 for files that only produced other
        messages), or ``None`` if running mypy raised (for example on timeout).
    """
    try:
        completed = subprocess.run(
            ["mypy", "."],
            cwd=d,
            capture_output=True,
            timeout=120,
            text=True,
        )
        outs = completed.stdout
    except Exception as e:
        print(e)
        return None

    filemap = {}
    for line in outs.split("\n"):
        if not line.strip():
            continue
        parts = line.split(":")
        # Lines without a colon carry no "file:" prefix and are skipped.
        if len(parts) < 2:
            continue
        file = parts[0].split("/")[-1]
        filemap.setdefault(file, 0)
        if "error:" in line:
            filemap[file] += 1

    return filemap

def typecheck_batch(files: List[str]) -> Dict[str, str]:
    """Write each snippet to a temp file, run mypy, and keep the ones without errors.

    Args:
        files: Source snippets to check.

    Returns:
        A dict mapping the SHA-1 hex digest of each surviving snippet to its
        text. Snippets for which ``run_mypy`` counted at least one error are
        removed. An empty dict is returned when ``run_mypy`` returns ``None``.

    NOTE(review): snippets are saved as ``.py`` and checked with mypy although
    the gathering step in this notebook collects JavaScript; confirm this is
    the intended checker.
    """
    filemap: Dict[str, str] = {}
    with tempfile.TemporaryDirectory() as tempdir:
        for contents in files:
            hex_dig = hashlib.sha1(contents.encode("utf8")).hexdigest()
            filemap[hex_dig] = contents
            name = os.path.join(tempdir, hex_dig + ".py")
            # Write with the same encoding that was hashed, not the platform default.
            with open(name, "w", encoding="utf8") as f:
                f.write(contents)

        # Run mypy in the temporary directory
        typecheck_map = run_mypy(tempdir)
        print(typecheck_map)

        if typecheck_map is None:
            return {}

        for filename, errors in typecheck_map.items():
            if errors == 0:
                continue
            # File names are "<sha1>.py"; strip only the extension to get the key.
            filemap.pop(os.path.splitext(filename)[0], None)

        print(f"Pass rate: {len(filemap)}/{len(files)}")
        return filemap

def infer_imports(code: str) -> str:
    """Let ``autoimport`` add missing imports to ``code``, giving up after 10 seconds.

    Args:
        code: Source text to fix.

    Returns:
        The text returned by ``autoimport.fix_code``; the unchanged ``code`` if
        that call raises, times out, or the alarm cannot be set up.
    """
    import autoimport

    def handler(signum, frame):
        raise Exception("Timeout")

    previous_handler = None
    alarm_armed = False
    try:
        previous_handler = signal.signal(signal.SIGALRM, handler)
        alarm_armed = True
        signal.alarm(10)
        return autoimport.fix_code(code)
    except Exception as e:
        print(f"Error while inferring imports: {e}")
        return code
    finally:
        # Only undo what was actually set up: disarm the alarm on every exit
        # path and give SIGALRM back to whoever owned it before this call.
        if alarm_armed:
            signal.alarm(0)
            # signal.signal() returns None when the old handler was not
            # installed from Python; such a handler cannot be re-installed.
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

In [None]:
print("Filtering to only functions with return statements")
ds = ds.filter(lambda ex: does_have_return(
    ex["content"]), num_proc=os.cpu_count())


In [None]:
ds

In [None]:
# if args.infer_imports:
#     print("Inferring imports for functions")
#     ds = ds.map(lambda ex: {"content": infer_imports(
#         ex["content"])}, num_proc=os.cpu_count())

batch = []
max_i = len(ds) - 1

new_ds = {
    "content": [],
    "sha1": [],
    "id": [],
}

e_id = 0

for i, ex in enumerate(tqdm(ds, total=len(ds))):
    try:
        code = ex["content"]
        batch.append(code)
    except Exception as e:
        print(f"There was an error: {e}")

    # Flush every 250 snippets and once more on the last row. This runs even
    # when the current row failed above, so the final batch is never skipped.
    if batch and (len(batch) >= 250 or i == max_i):
        try:
            filemap = typecheck_batch(batch)
            for sha1, contents in filemap.items():
                new_ds["content"].append(contents)
                new_ds["sha1"].append(sha1)
                new_ds["id"].append(e_id)
                e_id += 1
        except Exception as e:
            print(f"There was an error: {e}")
        finally:
            # Always start a fresh batch; a failed batch left in place would grow
            # past 250 and never be flushed again before the last row.
            batch = []

new_ds_hf = datasets.Dataset.from_dict(new_ds)

In [None]:
print(new_ds_hf['content'][0])

In [None]:
save_dir = "../datasets/seed2"

In [None]:
new_ds_hf.save_to_disk(save_dir)

### SEED GATHERING FILTER DATASET

In [None]:
import datasets
import os
from tree_sitter_parser import global_parser, LANGUAGE, does_have_return, make_parser
import benchmark_data
from tqdm import tqdm
import torch
import argparse
from vllm import LLM, SamplingParams
import random

In [None]:
# Query that captures the body block of a function definition as `fn-block`.
# NOTE(review): the extraction query earlier in this notebook matches
# `function_declaration` / `statement_block`, while this one matches
# `function_definition` / `block`. Confirm which grammar the `LANGUAGE`
# imported from tree_sitter_parser actually provides.
FN_BLOCK_QUERY = LANGUAGE.query("""
(function_definition
  body: (block) @fn-block)
""")


def template_few_shot(code, answer, rationale):
    """Render one solved few-shot example in the issue/comment prompt format.

    Args:
        code: Source of the example function, including its description.
        answer: The expected verdict inserted after "My answer is:".
        rationale: Explanation text placed after the verdict.

    Returns:
        The formatted prompt fragment as a string.
    """
    # NOTE(review): js_extract_docstring is not defined in the visible part of
    # this notebook, and the FEW_SHOTS entries passed in here are Python
    # functions with triple-quoted docstrings. Confirm the helper exists and
    # handles them.
    doc, code = js_extract_docstring(code)  # Update to extract JavaScript comments
    prompt = f"""
<issue_start>username_0: I have a function in JavaScript and I'd like someone to check my description of this function.
I'm doing this so that I can write a good comment for this function.

Here is the code for the function:
```js
{code}
```

Here is my description of this program:
```
{doc}
```

Do not attempt to execute the function or to judge its correctness.
Answer with "Yes" or "No" depending on if my description has enough information alone to re-implement the function.
Also, answer with "No" if the description does not match the function.<issue_comment>username_1: Sure, no problem. I will be able to help.
My answer is: {answer}

{rationale}

Upvotes: 200"""
    return prompt


# Few-shot examples for the description-quality prompt. Each entry is a
# (function source, "Yes"/"No" verdict, rationale) tuple that is unpacked into
# template_few_shot(code, answer, rationale).
# NOTE(review): every example below is a Python function with a triple-quoted
# docstring, while template_few_shot labels the code as JavaScript; confirm
# whether these should be replaced with JavaScript examples.
FEW_SHOTS = [
    (
        '''def simple_scan_network():
    """
    Do a simple network scan, which only works if your network configuration
    is 192.168.1.x
    """
    base_ip = "192.168.1."
    addresses = ['127.0.0.1']

    for index in range(1, 255):
        addresses.extend([base_ip + str(index)])

    return addresses''',
        "No",
        "The simple_scan_network function you have provided seems to generate addresses that then would be used for a network scan, but does not actually perform it, unlike the function claims.",
    ),
    (
        '''import pandas


def coerce_integer(df):
    """
    Loop through the columns of a df, if it is numeric,
    convert it to integer and fill nans with zeros.
    This is somewhat heavy-handed in an attempt to force
    Esri to recognize sparse columns as integers.
    """
    # Numeric columns to not coerce to integer
    EXCEPT = ["latitude", "longitude", "zipCode"]

    def numeric_column_to_int(series):
        return (
            series.fillna(0).astype(int)
            if pandas.api.types.is_numeric_dtype(series) and series.name not in EXCEPT
            else series
        )

    return df.transform(numeric_column_to_int, axis=0)''',
        "Yes",
        "The docstring does seem to match the implementation! The function loops through the columns of a df and coerces it as explained.",
    ),
    ('''def __trans_df_into_dict(data):
    """Converte DataFrame to dictionary.

    Args:
        data (pandas.DataFrame): DataFrame.

    Returns:
        dict: Name dictionary.
    """
    data["en_name"] = data["en_name"].str.upper()
    data["en_name_f"] = data["en_name"].str.split(" ", expand=True)[0]
    data["en_name_l"] = data["en_name"].str.split(" ", expand=True)[1]
    data["jp_name_f"] = data["jp_name"].str.split("・", expand=True)[0]
    data["jp_name_l"] = data["jp_name"].str.split("・", expand=True)[1]
    fullname_dict = dict(zip(data["en_name"], data["jp_name"]))
    fname_dict = dict(zip(data["en_name_f"], data["jp_name_f"]))
    lname_dict = dict(zip(data["en_name_l"], data["jp_name_l"]))
    return fullname_dict, fname_dict, lname_dict''',
     "No",
     "The function__trans_df_into_dict  does indeed convert a dataframe into a dictionary, however, it converts various columns that were not described in the docstring.\nFor instance, nowhere in the docstring it mentions handling japanese characters or the name of the column.",
     ),
    (
        '''def inchesToMeters(inches):
    """Convert inches to meters."""
    return inches * 0.0254''',
        "Yes",
        "inchesToMeters is a very simple function, the doccstring explains concisely its purpose, which is of converting inches to meters.",
    ),
    ('''def square_crop(im, target_size=None):
  """ Crop image to `target_size`. If that's None the image is squared
  to the smallest size
  """

  w = im.size[0]
  h = im.size[1]

  target_size = target_size if target_size else min(w, h)

  dx = (w - target_size) / 2
  dy = (h - target_size) / 2

  return im.crop((dx, dy, dx + target_size, dy + target_size))''',
     "Yes",
     "Following the standard description for docstrings for functions and methods, the square_crop function description tells exactly what the function does."
     ),
    ('''def _setup_motifs_files(args):
    """convenience fn, make sure setup is same across
    multiplicity/orientation/spacing workflows
    """
    motifs_files = {}
    motifs_files["early"] = "{}/{}/ggr.scanmotifs.h5".format(
        args.inputs["inference"][args.cluster]["scanmotifs_dir"],
        args.inputs["inference"][args.cluster]["scanmotifs_early_dir"])
    motifs_files["mid"] = "{}/{}/ggr.scanmotifs.h5".format(
        args.inputs["inference"][args.cluster]["scanmotifs_dir"],
        args.inputs["inference"][args.cluster]["scanmotifs_mid_dir"])
    motifs_files["late"] = "{}/{}/ggr.scanmotifs.h5".format(
        args.inputs["inference"][args.cluster]["scanmotifs_dir"],
        args.inputs["inference"][args.cluster]["scanmotifs_late_dir"])

    return motifs_files''',
     "No",
     "The docstring for _setup_motifs_files just says this is a convenience function. There is definitely not enough information to re-implement this function from the docstring alone.",
     ),
    ('''def trip(u, v):
    """
    Returns the scalar triple product of vectors u and v and z axis.
    The convention is z dot (u cross v). Dotting with the z axis simplifies
    it to the z component of the u cross v
    The product is:
        positive if v is to the left of u, that is,
          the shortest right hand rotation from u to v is ccw
        negative if v is to the right of u, that is,
          the shortest right hand rotation from u to v is cw
        zero if v is colinear with u
    Essentially trip is the z component of the cross product of u x v
    """
    return (u[0] * v[1] - u[1] * v[0])''',
     "Yes",
     "The docstring for the trip function is very detailed and describes the function's purpose and the mathematical formula used to calculate the scalar triple product.",
     )
]


def prompt_fmt(code):
    """Build the full few-shot prompt asking whether ``code``'s docstring is sufficient.

    The few-shot examples are rendered in a random order, followed by the
    unanswered question for ``code``.

    Args:
        code: Function source containing a triple-quoted docstring.

    Returns:
        The prompt string, ending in "My answer is:".

    NOTE(review): the final question labels the code as Python while
    ``template_few_shot`` labels its examples as JavaScript; confirm which
    wording is intended.
    """
    doc, code = py_extract_docstring(code)
    # Draw a shuffled copy: random.shuffle() would reorder the shared
    # module-level FEW_SHOTS list in place on every call.
    shots = random.sample(FEW_SHOTS, len(FEW_SHOTS))
    buf = "".join(template_few_shot(*few) for few in shots)
    buf += f"""<issue_start>username_0: I have a function in Python and I'd like someone to check my description of this function.
I'm doing this so that I can write a good docstring for this function.

Here is the code for the function:
```py
{code}
```

Here is my description of this program:
```
{doc}
```

Do not attempt to execute the function or to judge its correctness.
Answer with "Yes" or "No" depending on if my description has enough information alone to re-implement the function.
Also, answer with "No" if the description does not match the function.
Upvotes: 100<issue_comment>username_1: Sure, no problem. I will be able to help.
My answer is:"""
    return buf


def auto_dtype():
    """Pick the model dtype string: ``"bfloat16"`` when torch reports bf16 support, else ``"auto"``."""
    return "bfloat16" if torch.cuda.is_bf16_supported() else "auto"


def chunkify(lst, n):
    """Split ``lst`` into consecutive lists of at most ``n`` items.

    Args:
        lst: Indexable sequence with a length.
        n: Maximum chunk size.

    Returns:
        A list of lists; the last one may be shorter than ``n``.
    """
    total = len(lst)
    return [
        [lst[k] for k in range(start, min(start + n, total))]
        for start in range(0, total, n)
    ]


In [None]:
dataset = new_ds_hf

In [None]:
print(f"Loaded {len(dataset)} examples. Running pre-filtering...")

# Substrings that disqualify a sample (matched against the lower-cased code).
BAD_WORDS = ["todo", "fixme", "bug"]
BAD_IMPORTS = ["argparse", "os", "subprocess", "sys", "setuptools",
               "distutils", "matplotlib", "seaborn"]
# Expand every module name into both import spellings: all "import X" entries
# first, then all "from X" entries.
BAD_IMPORTS = [f"{keyword} {b}" for keyword in ("import", "from") for b in BAD_IMPORTS]
BAD_SUBSTRINGS = BAD_WORDS + BAD_IMPORTS

# Benchmark docstrings and solutions used to drop contaminated samples.
bench_filter = benchmark_data.filter_out()
all_bench = (
    bench_filter["human_eval_docstrings"]
    + bench_filter["human_eval_solutions"]
    + bench_filter["mbpp_docstrings"]
    + bench_filter["mbpp_solutions"]
)

In [None]:
def pre_filtering(ex):
    """Cheap heuristic filter applied before the model-based filter.

    A sample is rejected when any of the following holds:
      * it contains ``function()`` or ``() =>`` (no-argument functions),
      * its lower-cased text contains an entry of ``BAD_SUBSTRINGS``,
      * it contains a benchmark string from ``all_bench``,
      * it has more than 150 lines,
      * a line starts with ``def `` and contains ``():``,
      * ``does_have_return`` reports no value-returning ``return``,
      * the first statement of the function body is not a triple-double-quoted
        docstring, or the structural check raises.

    Args:
        ex: Dataset row; only ``ex["content"]`` is read.

    Returns:
        ``True`` to keep the sample, ``False`` to drop it.
    """
    code = ex["content"]

    # Filter out functions without arguments
    if "function()" in code or "() =>" in code:
        return False

    # filter out bad substrings
    lower = code.lower()
    if any(word in lower for word in BAD_SUBSTRINGS):
        return False

    # contaminated sample!
    if any(b in code for b in all_bench):
        return False

    # too many lines of code -- say 150
    lines = code.split("\n")
    if len(lines) > 150:
        return False

    # filter functions which don't have an argument (a `def` line with `():`)
    if any(line.startswith("def ") and "():" in line for line in lines):
        return False

    # filter out functions with no return statement
    # NOTE(review): the `parser=` keyword is accepted by the does_have_return
    # imported from tree_sitter_parser, not by the one-argument version defined
    # earlier in this notebook; confirm which one is in scope.
    parser = make_parser()
    if not does_have_return(code, parser=parser):
        return False

    try:
        tree = global_parser.parse(code.encode('utf-8'))
        block, _ = FN_BLOCK_QUERY.captures(tree.root_node)[0]

        # The first statement must be an expression statement wrapping a
        # string. Reject when EITHER condition fails; the previous `and`
        # rejected only when both failed.
        exp = block.children[0]
        if exp.type != 'expression_statement' or exp.children[0].type != 'string':
            return False

        # The string must both open and close with triple double quotes.
        docstring_text = exp.children[0].text.decode('utf-8')
        if not (docstring_text.startswith('"""') and docstring_text.endswith('"""')):
            return False
    except Exception as e:
        print(f"Error in filtering: {e}")
        return False

    return True  # all good!


# Use one process fewer than the CPU count, but never fewer than one:
# os.cpu_count() can return None, and on a single-core machine the plain
# subtraction would ask for zero workers.
threads = max(1, (os.cpu_count() or 2) - 1)
dataset = dataset.filter(pre_filtering, num_proc=threads)

In [None]:
dataset

In [None]:
model = LLM(f"../../../StarCoder", dtype=auto_dtype(),
            gpu_memory_utilization=0.95, tensor_parallel_size=1)


In [None]:
tokenizer = model.get_tokenizer()

In [None]:
print(f"Now running stage 3 filtering on {len(dataset)} examples...")

In [None]:
def unindent(s):
    """Remove the indentation shared by all non-blank lines of ``s``.

    Args:
        s: Multi-line text.

    Returns:
        The text with the smallest non-blank-line indent stripped from each
        line. Lines shorter than that indent are left untouched. Lines are
        re-joined with "\\n".
    """
    lines = s.splitlines()
    indents = [len(line) - len(line.lstrip()) for line in lines if line.strip()]
    min_indent = min(indents, default=0)
    return '\n'.join(
        line[min_indent:] if len(line) >= min_indent else line
        for line in lines
    )


def py_extract_docstring(code):
    """Split ``code`` into its first triple-double-quoted docstring and the remaining code.

    Args:
        code: Source text containing an opening and a closing ``\"\"\"``.

    Returns:
        ``(doc, code_without_doc)`` where ``doc`` is the unindented, stripped
        docstring body and ``code_without_doc`` is the source with the docstring
        (quotes included) removed.

    Raises:
        AssertionError: if no opening or no closing ``\"\"\"`` is found.
    """
    open_idx = code.find('"""')
    assert open_idx != -1
    body_start = open_idx + 3
    # Search from the first character after the opening quotes. Starting one
    # character later missed the closing quotes of an empty docstring.
    close_idx = code.find('"""', body_start)
    assert close_idx != -1
    doc = unindent(code[body_start:close_idx]).strip()
    code = code[:open_idx] + code[close_idx + 3:]
    return doc, code



In [None]:
dummy = 'def dummy(): \n    """\n    """\n pass'
dummy_prompt = prompt_fmt(dummy)
few_shot_toks = len(tokenizer.encode(
    dummy_prompt)) - len(tokenizer.encode(dummy))
print(f"Few-shot prompt has {few_shot_toks} tokens")

In [None]:
prompts = []
skipped = set()  # positions whose prompt was replaced by the dummy placeholder
for idx, ex in enumerate(tqdm(dataset, total=len(dataset), desc="Generating prompts")):
    code = ex["content"]
    toks = len(tokenizer.encode(code)) + few_shot_toks
    if toks > 16380:
        print(f"Skipping example with {toks} tokens")
        # to skip, just add dummy prompt (keeps prompts aligned with dataset rows)
        prompts.append(dummy_prompt)
        skipped.add(idx)
        continue
    p = prompt_fmt(code)
    prompts.append(p)

sampling_params = SamplingParams(temperature=0.0, stop="\n", max_tokens=5)

responses = []
for chunk in tqdm(chunkify(prompts, 512), desc="Generating responses"):
    outs = model.generate(chunk, sampling_params)
    for o in outs:
        answer = o.outputs[0].text.lower()
        # Accept only when "yes" clearly wins; ties and "no" default to False.
        responses.append(answer.count("yes") > answer.count("no"))

# A skipped example was never judged: the model only saw the dummy prompt.
# Force its verdict to False so it cannot pass on the dummy's answer.
for idx in skipped:
    responses[idx] = False



In [None]:
dataset

In [None]:
# Keep at most the first 75000 rows; clamp so a smaller dataset does not
# make select() fail on out-of-range indices.
subset = dataset.select(range(min(75000, len(dataset))))

In [None]:
subset

In [None]:
# Keep rows whose model verdict was "Yes". Row i of `subset` is row i of
# `dataset`, which is what `responses` is indexed by.
new_ds = subset.filter(  # horrible hack!
    lambda ex, i: responses[i] and "def dummy()" not in ex["content"], with_indices=True)
# Report against the collection that was actually filtered.
print(f"Filtered {len(subset) - len(new_ds)} examples")

In [None]:
new_ds.save_to_disk("../datasets/seed3")

In [None]:
new_ds

In [1]:
import shutil

# Parameters
# NOTE(review): absolute, user-specific path; this cell only runs on the
# machine where that directory exists.
folder_to_zip = '/Users/vaibhavgupta/Desktop/LLM_Training_JS/data'    # the folder you want to zip
archive_name  = 'data'           # base name of the archive; the output file is data.zip

# Create the ZIP
shutil.make_archive(archive_name, 'zip', folder_to_zip)

print(f"Created {archive_name}.zip")

Created data.zip
