In [None]:
!pip install -q transformers datasets

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os
import re
# --------------------
# 1) Load the LLaMA model
# --------------------
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# SECURITY: an access token must never be committed inside a notebook. The token that
# used to be hardcoded on this line has to be treated as leaked and should be revoked
# in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable (e.g. exported from a
# notebook secret). If the variable is unset this is None, in which case the library
# is expected to fall back to a cached `huggingface-cli login` token, if any.
hf_token = os.environ.get("HF_TOKEN")

# The same token is passed to both loaders.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # let the library place the weights on the available device(s)
    torch_dtype=torch.float16,  # load the weights in half precision
    token=hf_token,
)

In [None]:
# --------------------
# 2) List your JSON files
# --------------------
# Folder that receives one "<dataset>_summaries.json" file per processed input;
# it is created here if it does not exist yet.
output_dir = "/kaggle/working/summaries"
os.makedirs(output_dir, exist_ok=True)

# Input datasets to summarise. Each file is read line by line as JSON records.
# Uncomment an entry to include that dataset in the run.
json_files = [
    # "/kaggle/input/sample-data/sample.jsonl",
    # "/kaggle/input/sample-pairs/T1_300.jsonl",
    # "/kaggle/input/sample-pairs/T2_300.jsonl",
    "/kaggle/input/sample-pairs/VST3_300.jsonl",
    # "/kaggle/input/sample-pairs/ST3_300.jsonl",
    # "/kaggle/input/sample-pairs/MT3_300.jsonl",
    # "/kaggle/input/sample-pairs/WT3_T4_300.jsonl",
]

# --------------------
# 3) Enhanced extraction function
# --------------------
def extract_json_from_tags(text: str) -> dict:
    """Extract the summary JSON object from raw model output.

    Strategies are tried in order:

    1. The object wrapped in ``<json>...</json>`` tags.
    2. The first substring that decodes as a complete JSON object, found by
       trying every ``{`` in the text as a possible start. Unlike a greedy
       ``{...}`` regex, this still succeeds when the reply contains extra
       braces or prose after the object.
    3. Fallback: the raw text stored under both ``summary1`` and ``summary2``.

    Args:
        text: Decoded model output.

    Returns:
        A dict. It is the parsed JSON object when one is found (its keys are
        not validated here), otherwise the raw-text fallback.
    """
    # Strategy 1: exact <json> tag match
    tagged = re.search(r"<json>\s*(\{.*?\})\s*</json>", text, re.DOTALL)
    if tagged:
        try:
            parsed = json.loads(tagged.group(1))
        except json.JSONDecodeError:
            # Malformed content inside the tags: fall through to the scan below.
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # Strategy 2: first decodable JSON object anywhere in the text
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever follows it, so trailing text cannot break the parse.
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)

    # Strategy 3: return raw text as fallback
    return {"summary1": text, "summary2": text}

# --------------------
# 4) Optimized processing
# --------------------
def _clean_summary(value) -> str:
    """Turn a parsed summary value into plain text.

    Non-string JSON values are coerced with ``str`` so that an unexpected reply
    shape cannot raise ``AttributeError``. Markdown bold markers (``**``) and
    code fences (three or more backticks) are removed, then surrounding
    whitespace is stripped.
    """
    return re.sub(r"\*{2,}|`{3,}", "", str(value)).strip()


MAX_PROMPT_TOKENS = 2048  # tokenizer truncation limit applied to the prompt
MAX_NEW_TOKENS = 512      # generation budget for the reply

# For every input file: read the method pairs, ask the model for two summaries
# per pair, and write all summaries for that file to one JSON file in output_dir.
for file_path in json_files:
    print(f"Processing: {file_path}")
    with open(file_path, "r", encoding="utf-8") as f:
        # One JSON record per non-blank line
        data = [json.loads(line) for line in f if line.strip()]

    results = []
    for idx, item in enumerate(data):
        print(f"\n🔹 Processing pair {idx + 1}/{len(data)}")
        code1, code2 = item["code1"], item["code2"]

        # Prompt with strict formatting instructions
        prompt = (
            f"Write two separate 2-3 line summaries for these Java methods:\n\n"
            f"METHOD 1 (ID:{item['method1_id']}):\n{code1}\n\n"
            f"METHOD 2 (ID:{item['method2_id']}):\n{code2}\n\n"
            "Return ONLY JSON with exactly these two fields:\n"
            "{\"summary1\": \"summary text here\", \"summary2\": \"summary text here\"}\n"
            "Wrap in <json> tags. No explanations or extra text."
        )

        try:
            # Tokenize and generate
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_PROMPT_TOKENS,
            ).to(model.device)
            prompt_len = inputs["input_ids"].shape[1]
            if prompt_len >= MAX_PROMPT_TOKENS:
                print("⚠️ Prompt reached the token limit and may have been truncated; "
                      "the formatting instructions at its end could be missing.")

            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                repetition_penalty=1.1,
                eos_token_id=tokenizer.eos_token_id,
            )

            # Decode only the newly generated tokens. The output sequence starts
            # with the prompt tokens, so slicing by prompt length removes the echo
            # reliably. Comparing decoded text against the prompt string fails
            # whenever the prompt was truncated or decodes slightly differently.
            decoded = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

            # Extract JSON with robust fallback
            parsed = extract_json_from_tags(decoded)

            # Clean each summary and keep the cleaned value. Reassigning a loop
            # variable over [summary1, summary2] would discard the result.
            summary1 = _clean_summary(parsed.get("summary1", "Extraction Error"))
            summary2 = _clean_summary(parsed.get("summary2", "Extraction Error"))

        except torch.cuda.OutOfMemoryError:
            print("🚨 OOM encountered, skipping pair.")
            torch.cuda.empty_cache()
            summary1, summary2 = "OOM Error", "OOM Error"

        results.append({
            "clone_type": item["clone_type"],
            "method1_id": item["method1_id"],
            "method2_id": item["method2_id"],
            "summary1": summary1,
            "summary2": summary2,
        })

    # Save results as "<input name>_summaries.json" in output_dir
    out_file = os.path.join(output_dir, os.path.basename(file_path).replace(".jsonl", "_summaries.json"))
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"✅ Saved summaries to: {out_file}")

print("\n🎉 All files processed successfully!")

In [None]:
!pip install -q sentence-transformers scikit-learn

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import os

# --------------------
# 1) Configuration
# --------------------
# Input folder (summary JSON files) and output folder (similarity results).
summary_dir = "/kaggle/working/summaries"
results_dir = "/kaggle/working/results"
os.makedirs(results_dir, exist_ok=True)

# Sentence-embedding checkpoint name, and the cosine-similarity value above which
# a pair is predicted to be a clone.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
SIMILARITY_THRESHOLD = 0.6

# --------------------
# 2) Load embedding model
# --------------------
print("⏳ Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
print("✅ Embedding model loaded!")

In [40]:
# --------------------
# 3) Process each summary file
# --------------------
# Summary files to score, looked up inside summary_dir. A file only exists once the
# summary-generation step has been run for the matching dataset.
summary_files = [
    # "T1_300_summaries.json",
    "T2_300_summaries.json",
    # "VST3_300_summaries.json",
    # "ST3_300_summaries.json",
    # "MT3_300_summaries.json",
    # "WT3_T4_300_summaries.json"
]

# Placeholder texts written by the generation step when a pair could not be
# summarised; such pairs are never scored as real summaries.
FAILED_SUMMARIES = {"OOM Error", "Extraction Error"}

missing_files = []

for file_name in summary_files:
    print(f"\n{'='*50}")
    print(f"Processing: {file_name}")
    print(f"{'='*50}")

    file_path = os.path.join(summary_dir, file_name)

    # Skip files that have not been generated instead of aborting the whole cell
    # with FileNotFoundError.
    if not os.path.exists(file_path):
        print(f"⚠️ Skipping {file_name}: not found in {summary_dir}. "
              "Generate the summaries for this dataset first.")
        missing_files.append(file_name)
        continue

    # Load summary data
    with open(file_path, "r", encoding="utf-8") as f:
        summary_data = json.load(f)

    # Prepare texts for batch embedding: summary1 and summary2 of pair i end up
    # at positions 2*i and 2*i + 1.
    all_texts = []
    for item in summary_data:
        all_texts.append(item["summary1"])
        all_texts.append(item["summary2"])

    # Generate embeddings in batch
    print(f"🔧 Generating embeddings for {len(all_texts)} summaries...")
    embeddings = embedding_model.encode(all_texts,
                                       batch_size=128,
                                       show_progress_bar=True,
                                       convert_to_numpy=True)

    # Process each pair
    results = []
    scored_similarities = []  # similarities of pairs that have two real summaries
    for i, item in enumerate(summary_data):
        # Get corresponding embeddings
        idx = i * 2
        emb1 = embeddings[idx]
        emb2 = embeddings[idx + 1]

        # Pairs whose summaries are failure placeholders get a fixed non-clone
        # result; their similarity would be meaningless.
        if item["summary1"] in FAILED_SUMMARIES or item["summary2"] in FAILED_SUMMARIES:
            similarity = 0.0
            is_clone = False
        else:
            # Calculate cosine similarity
            similarity = float(cosine_similarity([emb1], [emb2])[0][0])
            is_clone = bool(similarity > SIMILARITY_THRESHOLD)
            scored_similarities.append(similarity)

        results.append({
            "clone_type": item["clone_type"],
            "method1_id": item["method1_id"],
            "method2_id": item["method2_id"],
            "summary1": item["summary1"],
            "summary2": item["summary2"],
            "embedding1": emb1.tolist(),  # Convert numpy array to list
            "embedding2": emb2.tolist(),
            "cosine_similarity": float(similarity),
            "is_clone_predicted": is_clone
        })

    # Save results
    out_file = os.path.join(results_dir, file_name.replace("_summaries", "_results"))
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    predicted_clones = sum(1 for r in results if r["is_clone_predicted"])
    failed_pairs = len(results) - len(scored_similarities)

    print(f"✅ Saved results for {len(results)} pairs to: {out_file}")
    print(f"📊 Clone detection stats:")
    print(f"   - Predicted clones: {predicted_clones}")
    print(f"   - Predicted non-clones: {len(results) - predicted_clones}")
    if failed_pairs:
        print(f"   - Pairs without usable summaries (excluded from average): {failed_pairs}")
    # The average covers scored pairs only, so the 0.0 placeholders of failed pairs
    # do not drag it down and an empty file does not produce NaN.
    if scored_similarities:
        print(f"   - Average similarity: {np.mean(scored_similarities):.4f}")
    else:
        print("   - Average similarity: n/a (no scored pairs)")

if missing_files:
    print(f"\n⚠️ Finished, but {len(missing_files)} file(s) were missing: {missing_files}")
else:
    print("\n🎉 All files processed successfully!")


Processing: T2_300_summaries.json
🔧 Generating embeddings for 600 summaries...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Saved results for 300 pairs to: /kaggle/working/results/T2_300_results.json
📊 Clone detection stats:
   - Predicted clones: 295
   - Predicted non-clones: 5
   - Average similarity: 0.9512

🎉 All files processed successfully!
