# Longformer Fine-tuning with LoRA: Review Classification

Use Longformer model with LoRA to distinguish between real reviews and AI-generated reviews.

- Longformer supports up to 4096 tokens for long text
- Task: Binary classification (Real reviews vs AI-generated reviews)
- **Using LoRA for memory-efficient fine-tuning**


## 1. Install Dependencies


In [1]:
%pip install transformers datasets accelerate peft -U


Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's dependency resolver doe

## 2. Mount Google Drive (if data is in Drive)


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## 3. Check GPU


In [3]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA L4


## 4. Import Libraries


In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType
import random

# Set random seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


## 5. Define Dataset Class


In [5]:
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=2048):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


## 6. Data Loading Function


In [6]:
def load_and_process_reviews(real_csv_path, ai_csv_path):
    """
    Load review data from two CSV files.
    real_csv_path: CSV file containing real reviews, label 0.
    ai_csv_path: CSV file containing AI-generated reviews, label 1.
    Data is balanced to have equal numbers of both classes.
    """
    real_reviews = []
    ai_reviews = []

    # Load real reviews (label 0)
    print(f"Loading real reviews from: {real_csv_path}")
    df_real = pd.read_csv(real_csv_path)
    for idx, row in df_real.iterrows():
        review_text = row['review_text']
        if pd.notna(review_text) and len(str(review_text).strip()) > 0:
            real_reviews.append(str(review_text))
    print(f"Loaded {len(real_reviews)} real reviews (label 0)")

    # Load AI-generated reviews (label 1)
    print(f"Loading AI reviews from: {ai_csv_path}")
    df_ai = pd.read_csv(ai_csv_path)
    for idx, row in df_ai.iterrows():
        review_text = row['review_text']
        if pd.notna(review_text) and len(str(review_text).strip()) > 0:
            ai_reviews.append(str(review_text))
    print(f"Loaded {len(ai_reviews)} AI reviews (label 1)")

    # Balance the dataset - use the minimum count
    min_count = min(len(real_reviews), len(ai_reviews))
    print(f"\nBalancing dataset to {min_count} samples per class...")

    # Randomly sample to balance classes
    if len(real_reviews) > min_count:
        real_reviews = random.sample(real_reviews, min_count)
    if len(ai_reviews) > min_count:
        ai_reviews = random.sample(ai_reviews, min_count)

    # Combine reviews and labels
    reviews = real_reviews + ai_reviews
    labels = [0] * len(real_reviews) + [1] * len(ai_reviews)

    print(f"\nBalanced dataset:")
    print(f"  - Label 0 (Real reviews): {labels.count(0)}")
    print(f"  - Label 1 (AI reviews): {labels.count(1)}")
    print(f"  - Total: {len(reviews)}")

    return reviews, labels


## 7. Configuration Parameters

**Important: Change the paths below to your actual data paths!**


In [11]:
# Data paths
REAL_REVIEW_PATH = '/content/drive/MyDrive/Notebooks/AI_review/iclr_2020_data/iclr_2020_reviews.csv'
AI_REVIEW_PATH = '/content/drive/MyDrive/Notebooks/AI_review/ai_review_2020.csv'

# Model configuration
MODEL_NAME = 'allenai/longformer-base-4096'
OUTPUT_DIR = '/content/drive/MyDrive/Notebooks/AI_review/finetuned_longformer_lora1'
MAX_LENGTH = 2048
BATCH_SIZE = 8  # Can try increasing batch size with LoRA
EPOCHS = 3
LEARNING_RATE = 2e-4  # LoRA typically uses higher learning rate

# LoRA configuration
LORA_R = 8  # LoRA rank, try 4, 8, or 16
LORA_ALPHA = 16  # Usually set to 2 * rank
LORA_DROPOUT = 0.1

print("Configuration completed!")
print(f"Model: {MODEL_NAME}")
print(f"Max length: {MAX_LENGTH}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Epochs: {EPOCHS}")
print(f"LoRA rank: {LORA_R}")

Configuration completed!
Model: allenai/longformer-base-4096
Max length: 2048
Batch size: 8
Epochs: 3
LoRA rank: 8


In [8]:
reviews, labels = load_and_process_reviews(REAL_REVIEW_PATH, AI_REVIEW_PATH)
print("Tokenizing all reviews to compute token length statistics...")

# Load tokenizer first (if not already loaded)
if 'tokenizer' not in locals():
    print(f"Loading tokenizer: {MODEL_NAME}")
    tokenizer = LongformerTokenizer.from_pretrained(MODEL_NAME)

# Tokenize all reviews and get token counts
token_lengths = []
for i, review in enumerate(reviews):
    if i % 100 == 0:
        print(f"Processing {i}/{len(reviews)}...")
    tokens = tokenizer(str(review), truncation=False, add_special_tokens=True)
    token_lengths.append(len(tokens['input_ids']))

# Calculate statistics
token_lengths = np.array(token_lengths)
mean_length = token_lengths.mean()
max_length = token_lengths.max()

print("\n" + "="*50)
print("TOKEN LENGTH STATISTICS")
print("="*50)
print(f"Total reviews: {len(token_lengths)}")
print(f"Average tokens: {mean_length:.0f}")
print(f"Max tokens: {max_length}")
print(f"\nCurrent MAX_LENGTH in config: {MAX_LENGTH}")
print("="*50)


Loading real reviews from: /content/drive/MyDrive/Notebooks/AI_review/iclr_2020_data/iclr_2020_reviews.csv
Loaded 306 real reviews (label 0)
Loading AI reviews from: /content/drive/MyDrive/Notebooks/AI_review/ai_review_2020.csv
Loaded 100 AI reviews (label 1)

Balancing dataset to 100 samples per class...

Balanced dataset:
  - Label 0 (Real reviews): 100
  - Label 1 (AI reviews): 100
  - Total: 200
Tokenizing all reviews to compute token length statistics...
Loading tokenizer: allenai/longformer-base-4096


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Processing 0/200...
Processing 100/200...

TOKEN LENGTH STATISTICS
Total reviews: 200
Average tokens: 561
Max tokens: 1649

Current MAX_LENGTH in config: 2048


## 9. Split Train and Validation Sets


In [9]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reviews, labels, test_size=0.2, random_state=SEED, stratify=labels
)

print(f"Data split:")
print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")


Data split:
Training set: 160 samples
Validation set: 40 samples


## 10. Load Model and Tokenizer


In [12]:
print(f"Loading Longformer model: {MODEL_NAME}")
print("First run will download the model, may take a few minutes...")

tokenizer = LongformerTokenizer.from_pretrained(MODEL_NAME)
model = LongformerForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

# Configure LoRA
print("Configuring LoRA...")
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    r=LORA_R,  # LoRA rank - smaller rank = fewer parameters, less memory
    lora_alpha=LORA_ALPHA,  # LoRA scaling parameter
    lora_dropout=LORA_DROPOUT,  # Dropout probability
    target_modules=["query", "value"],  # Apply LoRA to attention query and value
    bias="none",  # Don't train bias parameters
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
print("LoRA configured!")
model.print_trainable_parameters()  # Print number and percentage of trainable parameters

# Enable gradient checkpointing to save more memory
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
print("Gradient checkpointing enabled for memory efficiency!")

# Move model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f"Model loaded! Using device: {device}")


Loading Longformer model: allenai/longformer-base-4096
First run will download the model, may take a few minutes...


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Configuring LoRA...
LoRA configured!
trainable params: 887,042 || all params: 149,548,036 || trainable%: 0.5931
Gradient checkpointing enabled for memory efficiency!
Model loaded! Using device: cuda


## 11. Create Datasets


In [13]:
print("Creating datasets...")
train_dataset = ReviewDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
val_dataset = ReviewDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)
print("Datasets created!")


Creating datasets...
Datasets created!


## 12. Set Training Arguments


In [14]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    logging_steps=50,
    report_to="none",  # Disable wandb and other logging, no account needed
    gradient_accumulation_steps=2,  # Gradient accumulation to reduce memory usage
    fp16=True,  # Mixed precision training to reduce memory
    optim="adamw_torch",  # Optimizer
    warmup_steps=100,  # Learning rate warmup steps
    weight_decay=0.01,  # Weight decay
    save_total_limit=2,  # Keep only the last 2 checkpoints
    load_best_model_at_end=True,  # Load best model at end of training
    metric_for_best_model="eval_loss",  # Metric for selecting best model
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,

)


In [15]:
print("Starting training...")
trainer.train()
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"\nEvaluation results: {eval_results}")


Starting training...


Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss
1,0.6835,0.682141
2,0.6772,0.665796
3,0.6586,0.645526


Evaluating model...



Evaluation results: {'eval_loss': 0.645526111125946, 'eval_runtime': 4.064, 'eval_samples_per_second': 9.843, 'eval_steps_per_second': 1.23, 'epoch': 3.0}


## 16. Save Model


In [None]:
print(f"Saving LoRA model to: {OUTPUT_DIR}")
# Save LoRA adapter weights
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("LoRA weights and tokenizer saved!")
print(f"\nTo load the model later, use:")
print(f"  from peft import PeftModel")
print(f"  base_model = LongformerForSequenceClassification.from_pretrained('{MODEL_NAME}', num_labels=2)")
print(f"  model = PeftModel.from_pretrained(base_model, '{OUTPUT_DIR}')")

In [None]:
# Test sample
test_text = "This paper presents an interesting approach to solving the problem..."

# Encode
inputs = tokenizer(test_text, return_tensors='pt', truncation=True, max_length=MAX_LENGTH)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(predictions, dim=-1).item()

print(f"Prediction: {'AI-generated' if predicted_class == 1 else 'Real review'}")
print(f"Confidence: {predictions[0][predicted_class].item():.4f}")
print(f"\nClass probabilities:")
print(f"  Real review (0): {predictions[0][0].item():.4f}")
print(f"  AI-generated (1): {predictions[0][1].item():.4f}")
