# Task 2: Citation-Reference Linking with SciBERT

**Model:** allenai/scibert_scivocab_uncased (Cross-Encoder)

**Task:** Binary classification - Is this the correct bib entry?

**Features:**
- ‚úÖ Memory efficient (streaming data)
- ‚úÖ Auto resume from checkpoint
- ‚úÖ Works on Colab Free

---

In [1]:
# Kaggle already has these packages installed!
# No need to install: transformers, datasets, accelerate

# Verify versions (optional)
import transformers, datasets, accelerate
print(f"‚úÖ transformers: {transformers.__version__}")
print(f"‚úÖ datasets: {datasets.__version__}")
print(f"‚úÖ accelerate: {accelerate.__version__}")

‚úÖ transformers: 4.57.1
‚úÖ datasets: 4.4.1
‚úÖ accelerate: 1.11.0


In [2]:
# Data already unzipped by Kaggle - verify it
import os

train_path = '/kaggle/input/thesis-data-task2/train/train'
val_path = '/kaggle/input/thesis-data-task2/val/val'

train_count = len([f for f in os.listdir(train_path) if f.endswith('.in')])
val_count = len([f for f in os.listdir(val_path) if f.endswith('.in')])

print(f"‚úÖ Train: {train_count} files")
print(f"‚úÖ Val: {val_count} files")

‚úÖ Train: 55556 files
‚úÖ Val: 3000 files


In [3]:
# Load data - STREAMING with NEGATIVE SAMPLING (fix class imbalance)
import json
import random
from pathlib import Path
from datasets import IterableDataset

def generate_task2_examples(data_dir, neg_samples=5):
    """
    Generator with negative sampling to fix class imbalance
    
    Args:
        data_dir: Path to data directory
        neg_samples: Number of negative samples per positive (default: 5)
                     This creates 1:5 ratio instead of 1:99
    """
    data_path = Path(data_dir)
    in_files = sorted(data_path.glob("*.in"))
    
    total_files = len(in_files)
    print(f"üìä Found {total_files:,} .in files - streaming mode")
    print(f"‚öñÔ∏è Using negative sampling: {neg_samples} negatives per positive")

    for i, in_file in enumerate(in_files):
        if (i+1) % 5000 == 0:
            print(f"‚è≥ Processed {i+1:,}/{total_files:,} files ({(i+1)*100//total_files}%)")
        
        with open(in_file) as f:
            in_data = json.load(f)

        label_file = in_file.with_suffix('.label')
        if not label_file.exists():
            continue
            
        with open(label_file) as f:
            label_data = json.load(f)

        text = in_data.get('text', '')
        bib_entries = in_data.get('bib_entries', {})
        citation_to_bib = label_data.get('correct_citation', {})

        for citation, correct_bib_id in citation_to_bib.items():
            citation_pos = text.find(citation)
            if citation_pos == -1:
                continue

            start = max(0, citation_pos - 200)
            end = min(len(text), citation_pos + len(citation) + 200)
            context = text[start:end]

            # POSITIVE example
            if correct_bib_id in bib_entries:
                bib_data = bib_entries[correct_bib_id]
                bib_text = bib_data.get('abstract', bib_data.get('title', ''))
                yield {
                    'context': context,
                    'bib_entry': bib_text,
                    'label': 1
                }

            # NEGATIVE examples - SAMPLE only N negatives (not all!)
            negative_bib_ids = [bid for bid in bib_entries.keys() if bid != correct_bib_id]
            
            # Sample random negatives (or all if fewer than neg_samples)
            sampled_negatives = random.sample(
                negative_bib_ids, 
                min(neg_samples, len(negative_bib_ids))
            )
            
            for bib_id in sampled_negatives:
                bib_data = bib_entries[bib_id]
                bib_text = bib_data.get('abstract', bib_data.get('title', ''))
                yield {
                    'context': context,
                    'bib_entry': bib_text,
                    'label': 0
                }
    
    print(f"‚úÖ Finished processing all {total_files:,} files")

print("=" * 60)
print("Creating TRAIN dataset (streaming with negative sampling)...")
print("=" * 60)
train_dataset = IterableDataset.from_generator(
    generate_task2_examples, 
    gen_kwargs={'data_dir': '/kaggle/input/thesis-data-task2/train/train', 'neg_samples': 5}
)
print("‚úÖ Train dataset ready")

print("\n" + "=" * 60)
print("Creating VAL dataset (streaming with negative sampling)...")
print("=" * 60)
val_dataset = IterableDataset.from_generator(
    generate_task2_examples,
    gen_kwargs={'data_dir': '/kaggle/input/thesis-data-task2/val/val', 'neg_samples': 5}
)
print("‚úÖ Val dataset ready")

print("\nüí° Using IterableDataset with 1:5 positive:negative ratio!")
print("üí° This prevents class imbalance (instead of 1:99 ratio)")

Creating TRAIN dataset (streaming with negative sampling)...
‚úÖ Train dataset ready

Creating VAL dataset (streaming with negative sampling)...
‚úÖ Val dataset ready

üí° Using IterableDataset with 1:5 positive:negative ratio!
üí° This prevents class imbalance (instead of 1:99 ratio)


In [4]:
# Tokenize - DYNAMIC PADDING (memory efficient)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

def tokenize_function(examples):
    # Use padding=False, let DataCollator handle dynamic padding
    return tokenizer(
        examples['context'],
        examples['bib_entry'],
        max_length=512,
        padding=False,        # Changed from 'max_length' to False
        truncation=True
    )

print("Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

print("‚úÖ Tokenization complete (dynamic padding will be applied during training)")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Tokenizing datasets...
‚úÖ Tokenization complete (dynamic padding will be applied during training)


In [5]:
# Load model
from transformers import AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = AutoModelForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    num_labels=2
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

print(f"‚úÖ Model loaded: {model.num_parameters():,} parameters")

2026-01-13 08:19:31.011324: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768292371.225637      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768292371.288694      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768292371.807668      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768292371.807718      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768292371.807720      24 computation_placer.cc:177] computation placer alr

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded: 109,920,002 parameters


In [6]:
# Training setup - OPTIMIZED for IterableDataset
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from pathlib import Path
import os

# Dynamic padding collator (saves VRAM!)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='/kaggle/working/checkpoints/task2_scibert',
    max_steps=10000,              # Use max_steps for IterableDataset (not epochs)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4, # Effective batch size = 32
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    eval_strategy='steps',
    eval_steps=500,
    logging_steps=100,
    save_strategy='steps',
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=True,                    # Mixed precision to save VRAM
    report_to='none',
    seed=42
)

# Check for checkpoint to resume from
checkpoint_dir = Path(training_args.output_dir)
checkpoints = sorted(checkpoint_dir.glob('checkpoint-*')) if checkpoint_dir.exists() else []

resume_checkpoint = str(checkpoints[-1]) if checkpoints else None

if resume_checkpoint:
    print(f"üîÑ Resuming from: {Path(resume_checkpoint).name}")
else:
    print("üÜï Starting fresh training")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Use dynamic padding!
    compute_metrics=compute_metrics
)

print(f"\nüí° Training config:")
print(f"   - Effective batch size: {8 * 4} (per_device={8} √ó accumulation={4})")
print(f"   - Max steps: {10000}")
print(f"   - Dynamic padding: ON (saves VRAM)")
print(f"   - FP16: ON (saves VRAM)")

üÜï Starting fresh training

üí° Training config:
   - Effective batch size: 32 (per_device=8 √ó accumulation=4)
   - Max steps: 10000
   - Dynamic padding: ON (saves VRAM)
   - FP16: ON (saves VRAM)


  trainer = Trainer(


In [7]:
# Train
print("="*60)
print("üöÄ STARTING TRAINING")
print("="*60)

trainer.train(resume_from_checkpoint=resume_checkpoint)

print("\n‚úÖ Training complete!")

üöÄ STARTING TRAINING
üìä Found 55,556 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.389,0.423253,0.792996,0.429433,0.602539,0.501467
1000,0.3586,0.361369,0.849853,0.68309,0.244429,0.360029
1500,0.3603,0.365754,0.837594,0.5324,0.493653,0.512295
2000,0.346,0.345863,0.854288,0.726827,0.251058,0.373205
2500,0.377,0.336957,0.856944,0.65443,0.364598,0.468297
3000,0.3482,0.348003,0.85653,0.664299,0.343018,0.452423
3500,0.3757,0.339011,0.856043,0.732052,0.263188,0.387177
4000,0.3379,0.338021,0.857407,0.732458,0.275317,0.400205
4500,0.3312,0.341843,0.859138,0.698245,0.325388,0.44391
5000,0.3576,0.340005,0.860478,0.703185,0.333145,0.452101


üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
‚è≥ Processed 5,000/55,556 files (8%)
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished pr

In [8]:
# Evaluate
print("üìä VALIDATION RESULTS")
eval_results = trainer.evaluate()
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

üìä VALIDATION RESULTS
üìä Found 3,000 .in files - streaming mode
‚öñÔ∏è Using negative sampling: 5 negatives per positive
‚úÖ Finished processing all 3,000 files
eval_loss: 0.3411
eval_accuracy: 0.8539
eval_precision: 0.6007
eval_recall: 0.4609
eval_f1: 0.5216
eval_runtime: 641.1601
eval_samples_per_second: 63.9980
eval_steps_per_second: 4.0010
epoch: 1.0000


In [9]:
# Save final model
final_model_path = '/kaggle/working/models/task2_scibert_final'
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"‚úÖ Model saved to: {final_model_path}")
print("\nüí° TIP: Click 'Save Version' to commit and save this model permanently!")

‚úÖ Model saved to: /kaggle/working/models/task2_scibert_final

üí° TIP: Click 'Save Version' to commit and save this model permanently!


In [10]:
# Test inference
import torch
from transformers import pipeline

classifier = pipeline(
    'text-classification',
    model=final_model_path,
    tokenizer=final_model_path,
    device=0 if torch.cuda.is_available() else -1
)

# Test on validation example
result = classifier("Test citation context [SEP] Test bibliography entry")

print("\nüìã Test Inference:")
print(f"Prediction: {result[0]['label']} (score: {result[0]['score']:.4f})")
print("\n" + "="*60)
print("‚úÖ TASK 2 - SciBERT COMPLETE!")
print("="*60)

Device set to use cuda:0



üìã Test Inference:
Prediction: LABEL_0 (score: 0.7192)

‚úÖ TASK 2 - SciBERT COMPLETE!
