# Task 2: Citation-Reference Linking with RoBERTa

**Model:** roberta-base (State-of-the-art)

**Task:** Binary classification - given the text around a citation and one candidate bibliography entry, predict whether that entry is the one being cited.

**Features:**
- ✅ Memory efficient (streaming data)
- ✅ Auto resume from checkpoint
- ✅ Works on free-tier GPUs (Colab Free / Kaggle; the paths below assume Kaggle)

---

In [None]:
# Kaggle already has these packages installed!
# No need to install: transformers, datasets, accelerate

# Verify versions (optional)
# One import per line; a missing package fails here with a clear ImportError.
import transformers
import datasets
import accelerate

# Report the installed version of each library.
print(f"✅ transformers: {transformers.__version__}")
print(f"✅ datasets: {datasets.__version__}")
print(f"✅ accelerate: {accelerate.__version__}")

In [None]:
# Data already unzipped by Kaggle - verify it
import os

# Directories holding the train / validation splits.
train_path = '/kaggle/input/thesis-data-task2/train/train'
val_path = '/kaggle/input/thesis-data-task2/val/val'

# Count only the `.in` input files in each split (no intermediate list needed).
train_count = sum(1 for entry in os.listdir(train_path) if entry.endswith('.in'))
val_count = sum(1 for entry in os.listdir(val_path) if entry.endswith('.in'))

print(f"✅ Train: {train_count} files")
print(f"✅ Val: {val_count} files")

In [None]:
# Load data - STREAMING with NEGATIVE SAMPLING (fix class imbalance)
import json
import random
from pathlib import Path
from datasets import IterableDataset

def _bib_text(bib_data):
    """Return the entry's abstract, falling back to its title, else ''."""
    # `or` (not a .get default) so a present-but-empty/None abstract still
    # falls back to the title instead of yielding None or ''.
    return bib_data.get('abstract') or bib_data.get('title') or ''


def generate_task2_examples(data_dir, neg_samples=5, seed=42):
    """
    Stream (context, bib_entry, label) examples with negative sampling.

    For every ``*.in`` file in ``data_dir`` that has a sibling ``.label``
    file, each labelled citation found in the text yields one positive
    example (when its correct entry is present in ``bib_entries``) followed
    by up to ``neg_samples`` randomly chosen other entries as negatives.

    Args:
        data_dir: Path to the directory holding ``*.in`` / ``*.label`` JSON files.
        neg_samples: Number of negative samples per positive (default: 5).
                     This gives a 1:5 ratio instead of using every other
                     bib entry as a negative.
        seed: Seed for the generator's private RNG (default: 42). Every pass
              over the stream then draws the same negatives, so repeated
              evaluation runs score the same examples. Pass ``None`` to draw
              fresh negatives on each pass.

    Yields:
        dict with keys ``'context'`` (text window around the citation),
        ``'bib_entry'`` (abstract, or title when there is no abstract) and
        ``'label'`` (1 for the correct entry, 0 for a sampled negative).
    """
    data_path = Path(data_dir)
    in_files = sorted(data_path.glob("*.in"))

    # Private RNG: independent of the global `random` state, which other code
    # may reseed or consume between passes.
    rng = random.Random(seed)

    total_files = len(in_files)
    print(f"📊 Found {total_files:,} .in files - streaming mode")
    print(f"⚖️ Using negative sampling: {neg_samples} negatives per positive")

    for i, in_file in enumerate(in_files, start=1):
        if i % 5000 == 0:
            print(f"⏳ Processed {i:,}/{total_files:,} files ({i*100//total_files}%)")

        # Check for the label first so unlabeled inputs are never parsed.
        label_file = in_file.with_suffix('.label')
        if not label_file.exists():
            continue

        with open(in_file, encoding='utf-8') as f:
            in_data = json.load(f)
        with open(label_file, encoding='utf-8') as f:
            label_data = json.load(f)

        # `or` guards against keys that are present but null in the JSON.
        text = in_data.get('text') or ''
        bib_entries = in_data.get('bib_entries') or {}
        citation_to_bib = label_data.get('correct_citation') or {}

        for citation, correct_bib_id in citation_to_bib.items():
            # Only the first occurrence of the citation string is used.
            citation_pos = text.find(citation)
            if citation_pos == -1:
                continue

            # Context = the citation plus up to 200 characters on each side.
            start = max(0, citation_pos - 200)
            end = min(len(text), citation_pos + len(citation) + 200)
            context = text[start:end]

            # POSITIVE example
            if correct_bib_id in bib_entries:
                yield {
                    'context': context,
                    'bib_entry': _bib_text(bib_entries[correct_bib_id]),
                    'label': 1
                }

            # NEGATIVE examples - sample only N of the other entries
            # (or all of them if there are fewer than neg_samples).
            negative_bib_ids = [bid for bid in bib_entries if bid != correct_bib_id]
            sampled_negatives = rng.sample(
                negative_bib_ids,
                min(neg_samples, len(negative_bib_ids))
            )

            for bib_id in sampled_negatives:
                yield {
                    'context': context,
                    'bib_entry': _bib_text(bib_entries[bib_id]),
                    'label': 0
                }

    print(f"✅ Finished processing all {total_files:,} files")

_RULE = "=" * 60

# Keyword arguments forwarded to generate_task2_examples for each split.
_train_gen_kwargs = {'data_dir': '/kaggle/input/thesis-data-task2/train/train', 'neg_samples': 5}
_val_gen_kwargs = {'data_dir': '/kaggle/input/thesis-data-task2/val/val', 'neg_samples': 5}

print(_RULE)
print("Creating TRAIN dataset (streaming with negative sampling)...")
print(_RULE)
train_dataset = IterableDataset.from_generator(generate_task2_examples, gen_kwargs=_train_gen_kwargs)
print("✅ Train dataset ready")

print("\n" + _RULE)
print("Creating VAL dataset (streaming with negative sampling)...")
print(_RULE)
val_dataset = IterableDataset.from_generator(generate_task2_examples, gen_kwargs=_val_gen_kwargs)
print("✅ Val dataset ready")

print("\n💡 Using IterableDataset with 1:5 positive:negative ratio!")
print("💡 This prevents class imbalance (instead of 1:99 ratio)")

In [None]:
# Tokenize - DYNAMIC PADDING (memory efficient)
from transformers import AutoTokenizer

# The tokenizer must come from the same checkpoint as the model this notebook
# fine-tunes ('roberta-base'). A BERT tokenizer produces ids from a different
# vocabulary, and its segment ids (token_type_ids of 1 for the second
# sequence) are out of range for RoBERTa's single token-type embedding.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def tokenize_function(examples):
    """Tokenize a batch of (context, bib_entry) pairs as sequence pairs.

    Args:
        examples: Batch mapping with 'context' and 'bib_entry' columns.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per pair and left unpadded.
    """
    # padding=False: padding is deferred to the data collator, which pads
    # each batch dynamically.
    return tokenizer(
        examples['context'],
        examples['bib_entry'],
        max_length=512,
        padding=False,
        truncation=True
    )

print("Tokenizing datasets...")
# Attach the same batched tokenization step to both splits.
train_dataset, val_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
]

print("✅ Tokenization complete (dynamic padding will be applied during training)")

In [None]:
# Load model
from transformers import AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Two-label sequence classifier initialised from the roberta-base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)


def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy / precision / recall / F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'
        (precision/recall/F1 use sklearn's ``average='binary'``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    acc = accuracy_score(gold, predicted)
    prec, rec, f_score, _ = precision_recall_fscore_support(gold, predicted, average='binary')
    return dict(accuracy=acc, precision=prec, recall=rec, f1=f_score)


print(f"✅ RoBERTa-base loaded: {model.num_parameters():,} parameters")

In [None]:
# Training setup - OPTIMIZED for IterableDataset
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from pathlib import Path
import os

# Dynamic padding collator (saves VRAM!)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='/kaggle/working/checkpoints/task2_roberta',
    seed=42,
    report_to='none',
    # Length of the run: step-based because the dataset is an IterableDataset.
    max_steps=10000,
    warmup_steps=500,
    # Optimiser settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching: 8 per device x 4 accumulation steps = effective batch of 32.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    fp16=True,  # mixed precision
    # Evaluate and checkpoint on the same 500-step cadence; log every 100.
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    save_total_limit=3,
    logging_steps=100,
    # Reload the checkpoint with the best validation F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

# Check for checkpoint to resume from.
# Checkpoint folders are named 'checkpoint-<step>'. They must be ordered by
# the numeric step: a plain string sort ranks 'checkpoint-500' after
# 'checkpoint-1000' and would resume from a stale checkpoint.
def _checkpoint_step(path):
    """Return the integer step in a 'checkpoint-<step>' path, or -1 if absent."""
    suffix = path.name.rpartition('-')[2]
    return int(suffix) if suffix.isdecimal() else -1

checkpoint_dir = Path(training_args.output_dir)
checkpoints = sorted(
    (
        candidate
        for candidate in checkpoint_dir.glob('checkpoint-*')
        if candidate.is_dir() and _checkpoint_step(candidate) >= 0
    ),
    key=_checkpoint_step,
) if checkpoint_dir.exists() else []

# Highest step wins; None means there is nothing to resume from.
resume_checkpoint = str(checkpoints[-1]) if checkpoints else None

if resume_checkpoint:
    print(f"🔄 Resuming from: {Path(resume_checkpoint).name}")
else:
    print("🆕 Starting fresh training")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Use dynamic padding!
    compute_metrics=compute_metrics
)

# Read the summary from training_args itself so the printout cannot drift
# from the configuration that is actually used.
_per_device = training_args.per_device_train_batch_size
_accumulation = training_args.gradient_accumulation_steps

print(f"\n💡 Training config:")
print(f"   - Effective batch size: {_per_device * _accumulation} (per_device={_per_device} × accumulation={_accumulation})")
print(f"   - Max steps: {training_args.max_steps}")
print(f"   - Dynamic padding: ON (saves VRAM)")
print(f"   - FP16: {'ON (saves VRAM)' if training_args.fp16 else 'OFF'}")

In [None]:
# Train
_banner = "=" * 60
print(_banner)
print("🚀 STARTING TRAINING - RoBERTa")
print(_banner)

# resume_checkpoint is either None (fresh run) or the path of a saved checkpoint.
trainer.train(resume_from_checkpoint=resume_checkpoint)

print("\n✅ Training complete!")

In [None]:
# Evaluate
print("📊 VALIDATION RESULTS")
eval_results = trainer.evaluate()
# One line per reported metric, formatted to four decimal places.
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Save final model
final_model_path = '/kaggle/working/models/task2_roberta_final'

# Write the model and the tokenizer into the same directory.
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"✅ Model saved to: {final_model_path}")

_rule = "=" * 60
print("\n" + _rule)
print("✅ TASK 2 - RoBERTa COMPLETE!")
print(_rule)