In [1]:
from pathlib import Path
import os
import shutil

import subprocess

# Source repository to analyse and the local directory it is cloned into.
repo_url = "https://github.com/ROIM1998/APT.git"
local_path = "codebase"

# Clone the repo if not already done.
if not os.path.exists(local_path):
    # The argument list is passed to git directly (no shell), so the URL and
    # path are never re-parsed as shell text. check=True raises
    # CalledProcessError when the clone fails instead of letting later cells
    # run against a missing checkout.
    subprocess.run(["git", "clone", repo_url, local_path], check=True)


In [2]:
!pip install transformers



In [3]:
from pathlib import Path

def load_code_files(root_dir, extensions={".py", ".ipynb", ".md", ".txt"}):
    """Recursively read every text file under ``root_dir`` with a wanted suffix.

    Args:
        root_dir: Directory (str or Path) to walk recursively.
        extensions: Collection of file suffixes (including the leading dot)
            to keep. The default set is only read, never mutated.

    Returns:
        A list of ``{"path": str, "content": str}`` dicts, ordered by path so
        that positions (and anything keyed on them downstream) are stable
        across runs and filesystems.
    """
    code_files = []
    # rglob order depends on the filesystem; sort for reproducibility.
    for file_path in sorted(Path(root_dir).rglob("*")):
        if file_path.suffix not in extensions:
            continue
        # rglob also yields directories; one named e.g. "pkg.py" would make
        # the read below fail, so only regular files are loaded.
        if not file_path.is_file():
            continue
        # Undecodable bytes are dropped rather than aborting the whole scan.
        code = file_path.read_text(encoding="utf-8", errors="ignore")
        code_files.append({"path": str(file_path), "content": code})
    return code_files

# Load every matching file under the cloned repository into memory as
# {"path", "content"} records; later cells split these into chunks.
documents = load_code_files(local_path)


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with class/def boundaries listed ahead of the generic
# newline, space and empty-string separators.
splitter = RecursiveCharacterTextSplitter(
    separators=["\nclass ", "\ndef ", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)

# One record per chunk; chunk_id restarts at 0 for every source file.
docs_with_chunks = [
    {
        "content": piece,
        "metadata": {"source": record["path"], "chunk_id": position},
    }
    for record in documents
    for position, piece in enumerate(splitter.split_text(record["content"]))
]


In [20]:
from transformers import AutoTokenizer

# 1) Tokenizer of the LLaMA checkpoint the chunks will later be sent to
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# 2) Context-window size as reported by the tokenizer
max_tokens = tokenizer.model_max_length
print(f"Model context window = {max_tokens} tokens")

# 3) Token count of every chunk (special tokens excluded)
token_counts = [
    len(tokenizer(entry["content"], add_special_tokens=False)["input_ids"])
    for entry in docs_with_chunks
]
tot = sum(token_counts)

# Chunks that would not fit in the window: (source, chunk_id, token count)
too_long = [
    (entry["metadata"]["source"], entry["metadata"]["chunk_id"], count)
    for entry, count in zip(docs_with_chunks, token_counts)
    if count > max_tokens
]

print("Total chunks ", len(docs_with_chunks))
print("Total tokens ", tot)
print(f"{len(too_long)} chunks exceed the context window.")
# Only the first five offenders are listed to keep the output short.
for src, cid, ln in too_long[:5]:
    print(f"  ‚Äì {src} [chunk {cid}] is {ln} tokens (> {max_tokens})")


Model context window = 131072 tokens
Total chunks  2749
Total tokens  536921
0 chunks exceed the context window.


In [5]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.3/31.3 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [14]:
import torch
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# 1) Prefer the GPU when one is visible, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# 2) Embedding model, placed on the chosen device
embedder = SentenceTransformer("microsoft/codebert-base", device=device)

# 3) One embedding row per chunk, in docs_with_chunks order
embeddings = embedder.encode(
    [entry["content"] for entry in docs_with_chunks],
    show_progress_bar=True,
)

# 4) Exact L2 index; row i of the index corresponds to docs_with_chunks[i]
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))





Batches:   0%|          | 0/86 [00:00<?, ?it/s]

In [7]:
# Map the position of each chunk in docs_with_chunks to its metadata dict.
id_to_metadata = {
    position: entry["metadata"]
    for position, entry in enumerate(docs_with_chunks)
}


In [29]:
def semantic_search(query, top_k=5):
    """Return the chunks whose embeddings are closest to ``query``.

    Relies on the notebook-level ``embedder``, ``index``, ``id_to_metadata``
    and ``docs_with_chunks`` built in earlier cells.

    Args:
        query: Text to embed and search for.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``(metadata, content)`` tuples, nearest first. It holds
        fewer than ``top_k`` items when the index has fewer vectors, and is
        empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    # FAISS expects float32 input; make that explicit rather than relying on
    # whatever dtype the encoder happens to return.
    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")
    _distances, indices = index.search(query_embedding, top_k)

    hits = []
    for raw_id in indices[0]:
        chunk_pos = int(raw_id)
        # FAISS pads the result with -1 when fewer than top_k vectors exist;
        # -1 is not a valid chunk position, so skip it.
        if chunk_pos < 0:
            continue
        hits.append((id_to_metadata[chunk_pos], docs_with_chunks[chunk_pos]["content"]))
    return hits


In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def load_model(model_id):
    """Build a text-generation pipeline for the checkpoint ``model_id``.

    The tokenizer and causal-LM weights are both loaded from ``model_id``;
    dtype and device placement are left to the library ("auto").

    Returns:
        The pipeline object, configured to generate up to 2048 new tokens.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto",
    )
    return pipeline(
        "text-generation",
        model=causal_lm,
        tokenizer=tok,
        max_new_tokens=2048,
    )


In [10]:
def make_prompt(rubric_item: str, code_chunk: str):
    """Build the verification prompt for one (rubric item, code chunk) pair.

    Args:
        rubric_item: The contribution the codebase is expected to fulfil.
        code_chunk: One chunk of source text retrieved for that contribution.

    Returns:
        The prompt string: a role line, the contribution, the chunk inside a
        closed fenced code block, and an explicit question for the model.
    """
    # The fence is closed and followed by a concrete question; without them
    # the prompt ends inside the code block and gives the model no task.
    return f"""You are a research assistant helping verify whether parts of a codebase fulfill expected contributions.

Contribution to verify:
{rubric_item}

Here is a chunk of the code:
```python
{code_chunk}
```

Does this chunk implement or directly support the contribution above? Answer "Yes", "Partially", or "No" on the first line, then justify your answer in one or two sentences that reference the code.
"""

In [11]:

### Step 3: Run the Reasoning Loop

def analyze_with_llama(rubric_item, top_chunks, n_results=3, llm=None):
    """Ask the LLM about the first ``n_results`` retrieved chunks and print each answer.

    Args:
        rubric_item: Contribution text passed to ``make_prompt``.
        top_chunks: Sequence of ``(metadata, chunk_text)`` tuples, where
            ``metadata`` has ``"source"`` and ``"chunk_id"`` keys.
        n_results: How many leading chunks to analyse.
        llm: Callable that takes a prompt and returns a list whose first item
            has a ``"generated_text"`` key. When omitted, a notebook-level
            ``llm`` variable is used if one exists.

    Returns:
        The generated texts, in the order the chunks were analysed.

    Raises:
        NameError: If ``llm`` is not passed and no global ``llm`` is defined.
    """
    if llm is None:
        # Keep the old behaviour of reading a global `llm`, but fail with an
        # actionable message when it was never created.
        llm = globals().get("llm")
        if llm is None:
            raise NameError(
                "analyze_with_llama needs an LLM: pass llm=load_model(model_id) "
                "or define a global `llm` first"
            )

    results = []
    for meta, chunk in top_chunks[:n_results]:
        prompt = make_prompt(rubric_item, chunk)
        print(f"\nüîç [File: {meta['source']} | Chunk {meta['chunk_id']}]")
        result = llm(prompt)[0]['generated_text']
        print(result)
        results.append(result)
    return results


In [40]:
import json
from pathlib import Path
from typing import Any, Dict, List, Tuple

import faiss
import numpy as np
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Notebook-level placeholder list. main() keeps its results in a local
# variable and never writes to this one, so it stays empty unless another
# cell fills it.
all_results = []

from pathlib import Path
import json
from typing import List, Any, Dict

def load_rubric(rubric_path: Path) -> List[str]:
    """Flatten a rubric JSON file into a list of requirement strings.

    The file must hold a top-level JSON array of objects. Each object may
    carry a string ``"requirements"`` field and a ``"sub_tasks"`` list of
    objects with the same shape, nested to any depth.

    Args:
        rubric_path: Path to the UTF-8 encoded JSON rubric.

    Returns:
        All string ``"requirements"`` values in depth-first, parent-before-
        children order. Non-string ``"requirements"`` values are skipped.

    Raises:
        ValueError: If the top level is not an array, or a top-level entry or
            sub-task is not a JSON object.
    """
    def collect_requirements(item: Dict[str, Any], out: List[str]) -> None:
        # Add this item's own requirement.
        req = item.get("requirements")
        if isinstance(req, str):
            out.append(req)
        # `or []` also covers an explicit `"sub_tasks": null`, which a
        # `.get(..., [])` default would not.
        for sub in item.get("sub_tasks") or []:
            if not isinstance(sub, dict):
                # Same failure mode as a malformed top-level entry, instead of
                # an opaque AttributeError on `.get`.
                raise ValueError("Each sub_task must be an object with a 'requirements' key")
            collect_requirements(sub, out)

    raw = json.loads(rubric_path.read_text(encoding="utf-8"))
    if not isinstance(raw, list):
        raise ValueError(f"{rubric_path!r} should contain a top-level JSON array")

    all_reqs: List[str] = []
    for entry in raw:
        if not isinstance(entry, dict):
            raise ValueError("Each top-level element must be an object with a 'requirements' key")
        collect_requirements(entry, all_reqs)

    return all_reqs




def batched_analysis(rubric_items: List[str], llm: Any, top_k: int) -> List[Dict[str, Any]]:
    """Run the LLM over the top-``top_k`` retrieved chunks of every rubric item.

    Args:
        rubric_items: Requirement strings to verify.
        llm: Callable that takes a list of prompts and returns one output per
            prompt. Each output is either a dict with a ``"generated_text"``
            key or a list whose first element is such a dict. It is annotated
            ``Any`` because ``pipeline`` is a factory function, not a type.
        top_k: Number of chunks to retrieve per rubric item via
            ``semantic_search``.

    Returns:
        One dict per (rubric, chunk) pair with keys ``"rubric"``, ``"file"``,
        ``"chunk_id"`` and ``"llm_response"`` (the stripped generated text).
        Returns an empty list when there is nothing to analyse.
    """
    # One record per retrieved chunk, in rubric order then retrieval order.
    records: List[Dict[str, Any]] = [
        {
            "rubric": rubric,
            "file": meta["source"],
            "chunk_id": meta["chunk_id"],
            "prompt": make_prompt(rubric, chunk),
        }
        for rubric in rubric_items
        for meta, chunk in semantic_search(rubric, top_k=top_k)
    ]
    if not records:
        # Nothing to generate; skip building a dataset with no columns.
        return []

    ds = Dataset.from_list(records)

    def generate_batch(batch):
        outputs = llm(batch["prompt"])
        responses = []
        for output in outputs:
            # For a list of prompts, a text-generation pipeline returns one
            # list of candidates per prompt; take the first candidate. A flat
            # dict per prompt is accepted too.
            first = output[0] if isinstance(output, list) else output
            responses.append(first["generated_text"].strip())
        batch["response"] = responses
        return batch

    result_ds = ds.map(generate_batch, batched=True, batch_size=16)

    return [
        {
            "rubric": item["rubric"],
            "file": item["file"],
            "chunk_id": item["chunk_id"],
            "llm_response": item["response"],
        }
        for item in result_ds
    ]



In [41]:
# Run configuration read by main().
RUBRIC_PATH = Path("rubric_apt.json")  # JSON rubric parsed by load_rubric
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # checkpoint handed to load_model
TOP_K = 5  # chunks retrieved per rubric item
OUTPUT_PATH = Path("rubric_mapping.json")  # where the results JSON is written

# Main execution without CLI arguments
def main() -> None:
    """Run the whole rubric-to-code mapping and write the results as JSON.

    Reads the rubric at ``RUBRIC_PATH``, loads the model ``MODEL_ID``, runs
    ``batched_analysis`` with ``TOP_K`` chunks per rubric item, and dumps the
    resulting list to ``OUTPUT_PATH`` (UTF-8, 2-space indent).
    """
    rubric_items = load_rubric(RUBRIC_PATH)
    print(f"‚úÖ Loaded {len(rubric_items)} rubric items from '{RUBRIC_PATH}'")

    print(f"üöÄ Loading LLaMA model: {MODEL_ID}")
    generator = load_model(MODEL_ID)

    # Retrieval + generation for every rubric item
    mapping = batched_analysis(rubric_items, generator, top_k=TOP_K)

    # Persist the mapping
    with open(OUTPUT_PATH, "w", encoding="utf-8") as sink:
        json.dump(mapping, sink, indent=2)
    print(f"üéâ Batched analysis complete. Results saved to: {OUTPUT_PATH}")

# Run the pipeline only when this code executes as the top-level script;
# importing it as a module does not start the model download or generation.
if __name__ == "__main__":
    main()

‚úÖ Loaded 172 rubric items from 'rubric_apt.json'
üöÄ Loading LLaMA model: meta-llama/Llama-3.2-3B-Instruct


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


Map:   0%|          | 0/860 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 