# Task 2: Citation-Reference Linking with SciBERT

**Model:** allenai/scibert_scivocab_uncased (Cross-Encoder)

**Task:** Binary classification - Is this the correct bib entry?

**Features:**
- ✅ Memory efficient (streaming data)
- ✅ Auto resume from checkpoint
- ✅ Works on free-tier GPU notebooks (Colab Free / Kaggle; the paths below assume Kaggle)

---

In [None]:
# Install the Hugging Face libraries imported by the cells below
# (-q keeps pip's output quiet).
# NOTE(review): scikit-learn is imported later for the metrics but is not
# installed here — presumably it is preinstalled on the runtime; confirm.
!pip install transformers datasets accelerate -q

In [None]:
# Data already unzipped by Kaggle - verify it
import os

# Locations of the two splits inside the attached Kaggle dataset.
train_path = '/kaggle/input/thesis-data-task2/train/train'
val_path = '/kaggle/input/thesis-data-task2/val/val'

# One `.in` file per document; counting them is a cheap sanity check that the
# dataset is mounted where the later cells expect it.
train_count = sum(1 for entry in os.listdir(train_path) if entry.endswith('.in'))
val_count = sum(1 for entry in os.listdir(val_path) if entry.endswith('.in'))

print(f"✅ Train: {train_count} files")
print(f"✅ Val: {val_count} files")

In [None]:
# Load data (memory efficient)
import json
from pathlib import Path
from datasets import Dataset

def generate_task2_examples(data_dir, context_window=200):
    """Yield (context, bib entry, label) examples for citation linking.

    Every ``*.in`` file in ``data_dir`` is processed in sorted order together
    with its sibling ``*.label`` file. Each citation listed in the label
    file's ``citation_to_bib`` mapping is paired with every entry of the
    document's ``bib_entries``: label 1 for the entry the label file names,
    label 0 for all the others.

    Args:
        data_dir: Directory holding paired ``<name>.in`` / ``<name>.label``
            JSON files.
        context_window: Number of characters of document text kept on each
            side of the citation marker. Defaults to 200.

    Yields:
        dict with keys ``'context'`` (slice of the text around the first
        occurrence of the citation), ``'bib_entry'`` (candidate entry text)
        and ``'label'`` (1 or 0).

    Raises:
        ValueError: If ``context_window`` is negative.
        FileNotFoundError: If an ``.in`` file has no matching ``.label`` file.
        KeyError: If a JSON file lacks one of the expected keys.
    """
    if context_window < 0:
        raise ValueError("context_window must be non-negative")

    for in_file in sorted(Path(data_dir).glob("*.in")):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(in_file, encoding="utf-8") as f:
            in_data = json.load(f)
        with open(in_file.with_suffix('.label'), encoding="utf-8") as f:
            label_data = json.load(f)

        text = in_data['text']
        bib_entries = in_data['bib_entries']

        for citation, correct_bib_id in label_data['citation_to_bib'].items():
            # Only the first occurrence of the citation marker is used.
            citation_pos = text.find(citation)
            if citation_pos == -1:
                continue

            start = max(0, citation_pos - context_window)
            end = min(len(text), citation_pos + len(citation) + context_window)
            context = text[start:end]

            # Positive example (skipped when the labelled id is not among the
            # document's bib entries).
            if correct_bib_id in bib_entries:
                yield {
                    'context': context,
                    'bib_entry': bib_entries[correct_bib_id],
                    'label': 1,
                }

            # Negative examples: every other bib entry of the same document.
            # These are emitted even when the positive above was skipped.
            for bib_id, bib_text in bib_entries.items():
                if bib_id != correct_bib_id:
                    yield {
                        'context': context,
                        'bib_entry': bib_text,
                        'label': 0,
                    }

def create_dataset(data_dir):
    """Build a Hugging Face ``Dataset`` of citation / bib-entry pairs.

    Examples are pulled one at a time from ``generate_task2_examples`` and
    written to the datasets cache, instead of first being collected into
    Python lists, so peak memory does not grow with the number of examples.

    Args:
        data_dir: Directory of paired ``.in`` / ``.label`` files, forwarded
            to ``generate_task2_examples``.

    Returns:
        A ``Dataset`` with string columns ``context`` and ``bib_entry`` and an
        integer column ``label``.
    """
    # Local import: only this function needs the schema classes.
    from datasets import Features, Value

    # An explicit schema keeps the column types fixed even if the generator
    # yields nothing, where type inference has no example to work from.
    features = Features({
        'context': Value('string'),
        'bib_entry': Value('string'),
        'label': Value('int64'),
    })

    # NOTE(review): from_generator caches its output keyed on the generator
    # and its arguments; if files under the same data_dir change between
    # runs, the datasets cache may need clearing — confirm for your setup.
    return Dataset.from_generator(
        generate_task2_examples,
        features=features,
        gen_kwargs={'data_dir': data_dir},
    )

# Build the train split. create_dataset pairs each labelled citation with the
# candidate bib entries of its document, so the count printed below is the
# number of (citation, bib entry) pairs, not the number of files.
print("Loading train dataset...")
train_dataset = create_dataset('/kaggle/input/thesis-data-task2/train/train')
print(f"✅ Train: {len(train_dataset):,} examples")

# Same procedure for the validation split.
print("Loading val dataset...")
val_dataset = create_dataset('/kaggle/input/thesis-data-task2/val/val')
print(f"✅ Val: {len(val_dataset):,} examples")

In [None]:
# Tokenize
from transformers import AutoTokenizer

# Load the tokenizer for the same SciBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

def tokenize_function(examples):
    """Tokenize a batch of (context, bib_entry) pairs for the cross-encoder.

    Args:
        examples: Batch dict with parallel lists under ``'context'`` and
            ``'bib_entry'`` (the shape ``Dataset.map(batched=True)`` passes).

    Returns:
        The tokenizer output for the pairs, truncated to at most 512 tokens
        and left unpadded.
    """
    # No padding here: padding every pair to 512 tokens at map time inflates
    # the stored dataset and makes every training step pay for the full
    # length. The Trainer in this notebook is constructed with the tokenizer,
    # so its default collator pads each batch to that batch's longest
    # sequence instead.
    return tokenizer(
        examples['context'],
        examples['bib_entry'],
        max_length=512,
        truncation=True,
    )

print("Tokenizing datasets...")
# batched=True hands tokenize_function up to 1000 rows at a time as a dict of
# column lists.
train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=1000)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=1000)

# Expose only these columns, as torch tensors, when the datasets are indexed.
# NOTE(review): token_type_ids is not listed, so any segment ids the tokenizer
# produces for the sentence pair are not passed to the model — confirm this is
# intended.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print("✅ Tokenization complete!")

In [None]:
# Load model
from transformers import AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# SciBERT with a two-class sequence-classification head. In the data-loading
# cell label 1 marks the correct bib entry and label 0 any other entry.
model = AutoModelForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    num_labels=2
)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for the positive class.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by ``Trainer``;
            the argmax over axis 1 of the logits is taken as the predicted
            class.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    # zero_division=0 makes the "no positive predicted" case explicit: the
    # metrics are reported as 0 without an undefined-metric warning on every
    # evaluation. That case is likely early in training because negatives
    # greatly outnumber positives in this task.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='binary', zero_division=0
    )

    # Accuracy is kept for reference, but with a negative-heavy label
    # distribution a model that always predicts 0 also scores high on it;
    # F1 is the metric to watch.
    return {
        'accuracy': accuracy_score(labels, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Report the parameter count as a quick check that the model loaded.
print(f"✅ Model loaded: {model.num_parameters():,} parameters")

In [None]:
# Training setup
from transformers import TrainingArguments, Trainer
from pathlib import Path

# Hyperparameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoints are written here; the resume logic below scans this directory.
    output_dir='/kaggle/working/checkpoints/task2_scibert',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # Evaluate and save on the same 500-step cadence, so every saved
    # checkpoint has validation metrics from the same step.
    eval_strategy='steps',
    eval_steps=500,
    logging_steps=100,
    save_strategy='steps',
    save_steps=500,
    # Cap the number of checkpoints kept on disk.
    save_total_limit=3,
    # Select the final model by validation F1 ('f1' is one of the keys
    # returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # NOTE(review): mixed precision presumably requires a GPU runtime — confirm.
    fp16=True,
    # No external experiment tracker.
    report_to='none',
    seed=42
)

# Check for a checkpoint to resume from.
def _checkpoint_step(path):
    """Return the step number in a ``checkpoint-<step>`` name, or -1 if absent."""
    suffix = path.name.rpartition('-')[2]
    return int(suffix) if suffix.isdecimal() else -1


def _find_checkpoints(root):
    """Return the ``checkpoint-<step>`` directories under ``root``, oldest first."""
    root = Path(root)
    if not root.is_dir():
        return []
    candidates = [
        p for p in root.glob('checkpoint-*')
        if p.is_dir() and _checkpoint_step(p) >= 0
    ]
    # Order by the numeric step. A plain sort compares the names as text and
    # puts 'checkpoint-9500' after 'checkpoint-10000', which would resume
    # training from an older checkpoint than the latest one.
    return sorted(candidates, key=_checkpoint_step)


checkpoint_dir = Path(training_args.output_dir)
checkpoints = _find_checkpoints(checkpoint_dir)

# Option: From Kaggle dataset (if you uploaded a checkpoint)
# Uncomment this if you added a checkpoint dataset:
# if not checkpoints:
#     checkpoints = _find_checkpoints('/kaggle/input/task2-scibert-checkpoint')

# The last element is the checkpoint with the highest step.
resume_checkpoint = str(checkpoints[-1]) if checkpoints else None

if resume_checkpoint:
    print(f"🔄 Resuming from: {Path(resume_checkpoint).name}")
else:
    print("🆕 Starting fresh training")

# Wire the model, the tokenized splits and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Train
print("="*60)
print("🚀 STARTING TRAINING")
print("="*60)

# resume_checkpoint is None for a fresh run, otherwise the path of the
# checkpoint selected in the training-setup cell.
trainer.train(resume_from_checkpoint=resume_checkpoint)

print("\n✅ Training complete!")

In [None]:
# Evaluate
print("📊 VALIDATION RESULTS")
# No dataset is passed, so this uses the eval_dataset the Trainer was built
# with (val_dataset).
eval_results = trainer.evaluate()
# The :.4f format assumes every reported value is numeric.
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

In [None]:
# Save final model
# Model and tokenizer go to the same directory so the inference cell can load
# both from one path.
final_model_path = '/kaggle/working/models/task2_scibert_final'
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"✅ Model saved to: {final_model_path}")
print("\n💡 TIP: Click 'Save Version' to commit and save this model permanently!")

In [None]:
# Test inference
import torch
from transformers import pipeline

# Reload the saved model and tokenizer from disk; run on GPU 0 when CUDA is
# available, otherwise on CPU (-1).
classifier = pipeline(
    'text-classification',
    model=final_model_path,
    tokenizer=final_model_path,
    device=0 if torch.cuda.is_available() else -1
)

# Smoke test on a hard-coded placeholder string (not a real validation example).
# NOTE(review): training tokenizes context and bib entry as two separate
# arguments, while this passes one string containing a literal "[SEP]" —
# confirm the two produce equivalent model inputs before relying on scores.
result = classifier("Test citation context [SEP] Test bibliography entry")

print("\n📋 Test Inference:")
print(f"Prediction: {result[0]['label']} (score: {result[0]['score']:.4f})")
print("\n" + "="*60)
print("✅ TASK 2 - SciBERT COMPLETE!")
print("="*60)