In [None]:
# ===== CELL 1: Setup =====
# Notebook shell escapes (they only run under IPython/Jupyter): install the
# pinned helper packages and download the large English spaCy model.
!pip install -q rouge==1.0.1 code-tokenize==0.2.0 words2num==0.2.0 regex astor
!python -m spacy download en_core_web_lg

import nltk
import ssl
# Swap the default HTTPS context factory for the unverified one when this
# Python build exposes it. This has to happen before the nltk.download calls
# below so that they use the patched factory.
# NOTE(review): the patch is process-wide, so certificate verification is
# presumably disabled for every later default HTTPS context, not only for the
# NLTK downloads -- confirm this is acceptable for the notebook.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available: keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK corpora fetched quietly (no progress output).
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

import os
import sys
import pandas as pd

# Kaggle input datasets holding the project code and the training data.
CODE_DIR = '/kaggle/input/exploitgen-code'
DATA_DIR = '/kaggle/input/exploitgen-data'
# Put the project code first on the import path so its modules can be imported.
sys.path.insert(0, CODE_DIR)

print("✓ Setup complete!")

In [None]:
# ============================================================================
# CELL 2: FIX utils/__init__.py - CREATE A NEW FILE
# ============================================================================

# Create the directory for the new utils package that carries the corrected logic
!mkdir -p /kaggle/working/utils_fixed

# Source text of the generated utils_fixed/__init__.py module. It is kept as a
# string so the notebook can write it to disk; inner triple quotes and the
# newline escape in the debug print are therefore backslash-escaped.
UTILS_FIXED = """
import random
import numpy as np
import torch
import pandas as pd


# (source, similarity, target) CSV columns consumed by each training stage.
_STAGE_COLUMNS = {
    'stage1_raw': ('raw_nl', 'raw_nl', 'raw_code'),
    'stage1_temp': ('temp_nl', 'temp_nl', 'temp_code'),
    'stage2': ('raw_nl', 'temp_nl', 'temp_code'),
}


def set_seed(seed: int = 42):
    \"\"\"Seed the Python, NumPy and PyTorch random number generators.\"\"\"
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


class Example:
    \"\"\"One raw data row: two input texts and the target text.\"\"\"

    def __init__(self, idx, source, similarity, target):
        self.idx = idx
        self.source = source
        self.similarity = similarity
        self.target = target


class InputFeatures:
    \"\"\"Token ids and attention masks produced for one Example.\"\"\"

    def __init__(self, example_id, source_ids, source_mask, similarity_ids,
                 similarity_mask, target_ids, target_mask):
        self.example_id = example_id
        self.source_ids = source_ids
        self.source_mask = source_mask
        self.similarity_ids = similarity_ids
        self.similarity_mask = similarity_mask
        self.target_ids = target_ids
        self.target_mask = target_mask


def read_examples(filename: str, stage: str = 'stage2') -> list:
    \"\"\"Read a CSV file into a list of Example objects for the given stage.

    stage='stage1_raw':  raw_nl -> raw_code   (similarity mirrors the source)
    stage='stage1_temp': temp_nl -> temp_code (similarity mirrors the source)
    stage='stage2':      raw_nl + temp_nl -> temp_code
    Any other value is treated like 'stage2'.

    Cells are read verbatim as text. pandas' default NA detection is switched
    off so that data tokens such as 'null', 'None', 'NA' or 'nan' are kept
    as written instead of all collapsing into the string 'nan', and an empty
    cell yields '' rather than 'nan'. Every value is whitespace-stripped.

    Raises KeyError when a column required by the stage is missing.
    \"\"\"
    source_col, similarity_col, target_col = _STAGE_COLUMNS.get(
        stage, _STAGE_COLUMNS['stage2'])
    df = pd.read_csv(filename, dtype=str, keep_default_na=False)
    # Iterate the needed columns in lockstep instead of building one Series
    # per row with iterrows().
    rows = zip(df.index, df[source_col], df[similarity_col], df[target_col])
    return [
        Example(
            idx=idx,
            source=str(source).strip(),
            similarity=str(similarity).strip(),
            target=str(target).strip(),
        )
        for idx, source, similarity, target in rows
    ]


def _encode_padded(text, tokenizer, max_length):
    \"\"\"Tokenize text, wrap it in CLS/SEP and pad ids and mask to max_length.\"\"\"
    # Reserve two positions for the CLS and SEP tokens.
    tokens = tokenizer.tokenize(text)[:max_length - 2]
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    mask = [1] * len(ids)
    padding_length = max_length - len(ids)
    ids += [tokenizer.pad_token_id] * padding_length
    mask += [0] * padding_length
    return ids, mask


def convert_examples_to_features(examples, tokenizer, max_source_length,
                                 max_target_length, stage='train'):
    \"\"\"Turn Example objects into padded InputFeatures.

    source and similarity are encoded to max_source_length, the target to
    max_target_length. With stage='test' the target is not encoded: a single
    placeholder id 0 with mask 0 is emitted instead. The first example is
    printed once as a sanity check.
    \"\"\"
    features = []

    for example_idx, example in enumerate(examples):
        source_ids, source_mask = _encode_padded(
            example.source, tokenizer, max_source_length)
        similarity_ids, similarity_mask = _encode_padded(
            example.similarity, tokenizer, max_source_length)

        if stage == 'test':
            target_ids, target_mask = [0], [0]
        else:
            target_ids, target_mask = _encode_padded(
                example.target, tokenizer, max_target_length)

        # Debug output: only the first example, to keep the log short.
        if example_idx == 0:
            print(f"\\n*** Sample {stage} ***")
            print(f"source: {example.source[:50]}...")
            print(f"similarity: {example.similarity[:50]}...")
            print(f"target: {example.target[:50]}...")

        features.append(InputFeatures(
            example_id=example.idx,
            source_ids=source_ids,
            source_mask=source_mask,
            similarity_ids=similarity_ids,
            similarity_mask=similarity_mask,
            target_ids=target_ids,
            target_mask=target_mask
        ))

    return features


__all__ = [
    'set_seed',
    'Example',
    'InputFeatures',
    'read_examples',
    'convert_examples_to_features'
]
"""

with open('/kaggle/working/utils_fixed/__init__.py', 'w') as f:
    f.write(UTILS_FIXED)

# Copy eval.py từ code gốc
!cp {CODE_DIR}/utils/eval.py /kaggle/working/utils_fixed/

print("✓ Fixed utils created")

In [None]:
# ============================================================================
# CELL 3: DAPT 
# ============================================================================
print("\n" + "="*80)
print("STAGE: DAPT")
print("="*80)

from transformers import (
    RobertaForMaskedLM, 
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import Dataset

# Load the tab-separated SPoC training split and keep only the rows that have
# both a 'text' and a 'code' value.
spoc_df = pd.read_csv(f'{DATA_DIR}/spoc/spoc-train.tsv', sep='\t')
spoc_df = spoc_df.dropna(subset=['text', 'code'])

# One "<text> </s> <code>" sample per row. Walking the two columns in lockstep
# yields the same strings as the former iterrows() loop without creating a
# Series object for every row of the corpus.
texts = [
    f"{text} </s> {code}"
    for text, code in zip(spoc_df['text'], spoc_df['code'])
]

print(f"✓ {len(texts)} DAPT samples")

tokenizer = RobertaTokenizerFast.from_pretrained('microsoft/codebert-base')
dataset = Dataset.from_dict({"text": texts})

def tokenize_fn(examples):
    """Encode examples["text"] with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128; the
    tokenizer's return value is passed back unchanged.
    """
    encoding_options = {
        'max_length': 128,
        'padding': 'max_length',
        'truncation': True,
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize the corpus in batches and drop the raw "text" column afterwards.
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base')

# Masked-language-modelling collator with a masking probability of 0.15.
collator_options = {'tokenizer': tokenizer, 'mlm': True, 'mlm_probability': 0.15}
data_collator = DataCollatorForLanguageModeling(**collator_options)

# NOTE(review): both num_train_epochs and max_steps are set. Per the Hugging
# Face TrainingArguments documentation a positive max_steps should take
# precedence -- confirm that 30000 steps is the intended schedule.
dapt_training_options = dict(
    output_dir='/kaggle/working/dapt-model',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    max_steps=30000,
    save_steps=10000,
    save_total_limit=1,
    learning_rate=4e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_steps=500,
    fp16=True,
    dataloader_num_workers=2,
    report_to='none',
    disable_tqdm=False,  # keep the progress bar
    logging_first_step=False,
)
training_args = TrainingArguments(**dapt_training_options)

trainer = Trainer(
    train_dataset=tokenized,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

print("Training DAPT...")
trainer.train()

# Persist the model and its tokenizer side by side so the directory can be
# loaded again with from_pretrained.
DAPT_OUTPUT = '/kaggle/working/dapt-model'
trainer.save_model(DAPT_OUTPUT)
tokenizer.save_pretrained(DAPT_OUTPUT)
print(f"✓ DAPT saved")


In [None]:
# ============================================================================
# CELL 4: TAPT 
# ============================================================================
print("\n" + "="*80)
print("STAGE: TAPT")
print("="*80)

asm_train = pd.read_csv(f'{DATA_DIR}/assembly/train.csv')
asm_dev = pd.read_csv(f'{DATA_DIR}/assembly/dev.csv')
py_train = pd.read_csv(f'{DATA_DIR}/python/train.csv')
py_dev = pd.read_csv(f'{DATA_DIR}/python/dev.csv')

all_data = pd.concat([asm_train, asm_dev, py_train, py_dev], ignore_index=True)

# Up to three "<nl> </s> <code>" samples per row, emitted in the order
# (raw_nl, raw_code), (raw_nl, temp_code), (temp_nl, temp_code).
# A pair with a missing field is skipped: formatting a NaN into the f-string
# would otherwise inject the literal text "nan" into the pre-training corpus
# (the DAPT cell avoids the same problem with dropna).
texts = []
skipped_pairs = 0
row_fields = zip(
    all_data['raw_nl'],
    all_data['temp_nl'],
    all_data['raw_code'],
    all_data['temp_code'],
)
for raw_nl, temp_nl, raw_code, temp_code in row_fields:
    for nl, code in ((raw_nl, raw_code), (raw_nl, temp_code), (temp_nl, temp_code)):
        if pd.isna(nl) or pd.isna(code):
            skipped_pairs += 1
            continue
        texts.append(f"{nl} </s> {code}")

print(f"✓ {len(texts)} TAPT samples")
if skipped_pairs:
    print(f"  ({skipped_pairs} pairs skipped because a field was missing)")

# Wrap the TAPT samples in a Dataset and tokenize them with the same
# tokenize_fn that the DAPT cell defined.
dataset = Dataset.from_dict({"text": texts})
tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

# Continue from the model saved by the DAPT cell.
model = RobertaForMaskedLM.from_pretrained(DAPT_OUTPUT)

tapt_training_options = dict(
    output_dir='/kaggle/working/fg-codebert',
    overwrite_output_dir=True,
    num_train_epochs=25,
    per_device_train_batch_size=32,
    save_steps=5000,
    save_total_limit=1,
    learning_rate=4e-5,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=200,
    fp16=True,
    dataloader_num_workers=2,
    report_to='none',
    disable_tqdm=False,
    logging_first_step=False,
)
training_args = TrainingArguments(**tapt_training_options)

# data_collator is the masked-LM collator created in the DAPT cell.
trainer = Trainer(
    train_dataset=tokenized,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

print("Training TAPT...")
trainer.train()

# Save model and tokenizer together; later cells read them via TAPT_OUTPUT.
TAPT_OUTPUT = '/kaggle/working/fg-codebert'
trainer.save_model(TAPT_OUTPUT)
tokenizer.save_pretrained(TAPT_OUTPUT)
print(f"✓ FG-CodeBERT saved")

In [None]:
# ============================================================================
# CELL 5: Stage 1 & 2 Training 
# ============================================================================
print("\n" + "="*80)
print("STAGE 1 & 2: Training (FIXED)")
print("="*80)

# Use the fixed utils: make /kaggle/working importable first
sys.path.insert(0, '/kaggle/working')
from utils_fixed import set_seed

# Import the model from the original code (left unchanged)
os.chdir(CODE_DIR)
from model import CodeBert_Seq2Seq

set_seed(42)

LANGUAGE = 'assembly'

# Any LANGUAGE other than 'assembly' selects the python dataset directory.
_dataset_dir = 'assembly' if LANGUAGE == 'assembly' else 'python'
TRAIN_DATA, DEV_DATA, TEST_DATA = (
    f'{DATA_DIR}/{_dataset_dir}/{split}.csv'
    for split in ('train', 'dev', 'test')
)

OUTPUT_STAGE1_RAW = '/kaggle/working/stage1-raw-encoder'
OUTPUT_STAGE1_TEMP = '/kaggle/working/stage1-temp-encoder'
OUTPUT_FINAL = '/kaggle/working/exploitgen-final'

# Create every output directory up front; existing ones are left alone.
for _output_dir in (OUTPUT_STAGE1_RAW, OUTPUT_STAGE1_TEMP, OUTPUT_FINAL):
    os.makedirs(_output_dir, exist_ok=True)

# ===== STAGE 1A: Raw Encoder (FIXED) =====
print("\n[STAGE 1A] Training Raw Encoder (raw_nl -> raw_code)")

# Patch the model module so it uses utils_fixed: read_examples is pinned to
# the 'stage1_raw' column mapping, convert_examples_to_features is swapped
# for the fixed implementation.
# NOTE(review): presumably model.py resolves both names from its own module
# globals at call time -- verify against model.py.
import model as model_module
_fixed_utils = __import__('utils_fixed')
model_module.read_examples = lambda f: _fixed_utils.read_examples(f, stage='stage1_raw')
model_module.convert_examples_to_features = _fixed_utils.convert_examples_to_features

stage1a_model_options = dict(
    ip_path=TAPT_OUTPUT,
    raw_path=TAPT_OUTPUT,
    decoder_layers=6,
    fix_encoder=False,
    beam_size=10,
    max_source_length=64,
    max_target_length=64,
    load_model_path=None,
    layer_attention=False,
    l2_norm=False,
    fusion=False,
)
model_raw = CodeBert_Seq2Seq(**stage1a_model_options)

stage1a_train_options = dict(
    train_filename=TRAIN_DATA,
    train_batch_size=32,
    num_train_epochs=5,
    learning_rate=4e-5,
    do_eval=True,
    dev_filename=DEV_DATA,
    eval_batch_size=64,
    output_dir=OUTPUT_STAGE1_RAW,
    gradient_accumulation_steps=1,
)
model_raw.train(**stage1a_train_options)

print(f"✓ Raw Encoder saved")

# Save the encoder on its own (IMPORTANT!): keep only the state-dict entries
# whose key mentions 'raw_encoder'.
import torch
raw_encoder_state = {}
for _key, _tensor in model_raw.model.state_dict().items():
    if 'raw_encoder' in _key:
        raw_encoder_state[_key] = _tensor
torch.save(raw_encoder_state, f'{OUTPUT_STAGE1_RAW}/raw_encoder.bin')

# ===== STAGE 1B: Template Encoder (FIXED) =====
print("\n[STAGE 1B] Training Template Encoder (temp_nl -> temp_code)")


# Patch read_examples again, this time pinned to the 'stage1_temp' mapping.
def _read_stage1_temp_examples(f):
    return __import__('utils_fixed').read_examples(f, stage='stage1_temp')


model_module.read_examples = _read_stage1_temp_examples

stage1b_model_options = dict(
    ip_path=TAPT_OUTPUT,
    raw_path=TAPT_OUTPUT,
    decoder_layers=6,
    fix_encoder=False,
    beam_size=10,
    max_source_length=64,
    max_target_length=64,
    load_model_path=None,
    layer_attention=False,
    l2_norm=False,
    fusion=False,
)
model_temp = CodeBert_Seq2Seq(**stage1b_model_options)

stage1b_train_options = dict(
    train_filename=TRAIN_DATA,
    train_batch_size=32,
    num_train_epochs=5,
    learning_rate=4e-5,
    do_eval=True,
    dev_filename=DEV_DATA,
    eval_batch_size=64,
    output_dir=OUTPUT_STAGE1_TEMP,
    gradient_accumulation_steps=1,
)
model_temp.train(**stage1b_train_options)

print(f"✓ Template Encoder saved")

# Save the encoder on its own: keep entries whose key contains 'encoder' but
# not 'raw_encoder'.
temp_encoder_state = {}
for _key, _tensor in model_temp.model.state_dict().items():
    if 'encoder' not in _key or 'raw_encoder' in _key:
        continue
    temp_encoder_state[_key] = _tensor
torch.save(temp_encoder_state, f'{OUTPUT_STAGE1_TEMP}/temp_encoder.bin')

# ===== STAGE 2: Full ExploitGen (FIXED) =====
print("\n[STAGE 2] Training Full ExploitGen (raw_nl + temp_nl -> temp_code)")

# Patch read_examples again, this time pinned to the 'stage2' mapping
model_module.read_examples = lambda f: __import__('utils_fixed').read_examples(f, stage='stage2')

model_final = CodeBert_Seq2Seq(
    ip_path=TAPT_OUTPUT,  # template encoder weights are loaded on top below
    raw_path=TAPT_OUTPUT, # raw encoder weights are loaded on top below
    decoder_layers=6,
    fix_encoder=False,
    beam_size=10,
    max_source_length=64,
    max_target_length=64,
    load_model_path=None,
    layer_attention=True,
    l2_norm=True,
    fusion=True
)

# Load the pre-trained encoders from Stage 1 into the Stage 2 state dict.
# Only tensors whose key also exists in the Stage 2 model are copied. The
# Stage 1 models were built with different fusion/attention flags, so the key
# sets may not line up; the number of matched tensors is therefore reported,
# with a loud warning when nothing matched, instead of silently training
# without the Stage 1 weights.
print("Loading Stage 1 encoders...")
model_dict = model_final.model.state_dict()

stage1_checkpoints = (
    ('raw', f'{OUTPUT_STAGE1_RAW}/raw_encoder.bin'),
    ('template', f'{OUTPUT_STAGE1_TEMP}/temp_encoder.bin'),
)
for encoder_label, checkpoint_path in stage1_checkpoints:
    saved_state = torch.load(checkpoint_path)
    matched_state = {k: v for k, v in saved_state.items() if k in model_dict}
    print(f"  {encoder_label} encoder: {len(matched_state)}/{len(saved_state)} tensors matched")
    if not matched_state:
        print(f"  WARNING: no {encoder_label} encoder weights from {checkpoint_path} "
              "match the Stage 2 model; Stage 2 will start without them")
    model_dict.update(matched_state)

model_final.model.load_state_dict(model_dict)

model_final.train(
    train_filename=TRAIN_DATA,
    train_batch_size=8,
    num_train_epochs=50,
    learning_rate=4e-5,
    do_eval=True,
    dev_filename=DEV_DATA,
    eval_batch_size=16,
    output_dir=OUTPUT_FINAL,
    gradient_accumulation_steps=4
)

print(f"✓ Final model saved")

# ===== EVALUATION =====
print("\n[EVALUATION] Testing...")

# Checkpoint file expected under the Stage 2 output directory.
best_model_path = f'{OUTPUT_FINAL}/checkpoint-best-rouge/pytorch_model.bin'

# Same architecture flags as the Stage 2 model, with weights restored from
# best_model_path through load_model_path.
eval_model_options = dict(
    ip_path=TAPT_OUTPUT,
    raw_path=TAPT_OUTPUT,
    decoder_layers=6,
    fix_encoder=False,
    beam_size=10,
    max_source_length=64,
    max_target_length=64,
    load_model_path=best_model_path,
    layer_attention=True,
    l2_norm=True,
    fusion=True,
)
model_eval = CodeBert_Seq2Seq(**eval_model_options)

eval_test_options = dict(
    test_filename=TEST_DATA,
    test_batch_size=16,
    output_dir=f'{OUTPUT_FINAL}/test_results',
)
model_eval.test(**eval_test_options)

_banner = "=" * 80
print("\n" + _banner)
print("TRAINING COMPLETE!")
print(_banner)