In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

!pip install -q transformers
!pip install -q peft
!pip install -q evaluate
!pip install -q scikit-learn
!pip install -q matplotlib seaborn
!pip install -q datasets
!pip install -q numpy

In [None]:
from datasets import load_dataset

# Hub identifier of the corpus used throughout this notebook.
DATASET_ID = "ccdv/arxiv-classification"

# No split is requested, so every split of the dataset is loaded.
dataset = load_dataset(DATASET_ID)

In [None]:
# List the columns available in the training split.
train_columns = dataset["train"].column_names
print(train_columns)

In [None]:
# Show the first training example. String fields are clipped so that a very
# long document does not flood the cell output.
PREVIEW_CHARS = 500
first_example = dataset['train'][0]
preview = {
    key: (value[:PREVIEW_CHARS] + "..." if isinstance(value, str) and len(value) > PREVIEW_CHARS else value)
    for key, value in first_example.items()
}
print(preview)

In [None]:
# Dataset.unique() collects the distinct values of a column without first
# building a Python list of every label; sorting makes the printed order
# deterministic across runs.
unique_labels = sorted(dataset['train'].unique('label'))
print(f"Number of unique labels: {len(unique_labels)}")
print(f"Unique labels:{(unique_labels)}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_label_distribution(split_name):
    """Plot and summarise the label distribution of one dataset split.

    Reads the split from the notebook-level ``dataset`` object, draws a bar
    chart of the per-label counts, prints the smallest/largest class counts
    together with a simple imbalance heuristic, and finally prints the number
    of missing values per column.

    Args:
        split_name: Key of the split inside ``dataset`` (e.g. ``"train"``).
    """
    # Convert the split to pandas once and reuse the frame for both the label
    # counts and the missing-value check. Dataset.to_pandas() avoids building
    # the frame row by row, which is what pd.DataFrame(dataset[split]) does.
    split_df = dataset[split_name].to_pandas()
    value_counts = split_df["label"].value_counts().sort_index()

    # Plot on an explicit figure/axes pair rather than pyplot's implicit state.
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue`; revisit this call if the warning becomes an error.
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis", ax=ax)
    ax.set_title(f"Label Distribution in {split_name} set")
    ax.set_xlabel("Label ID")
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", rotation=90)
    fig.tight_layout()
    plt.show()

    # Print imbalance info: flag the split when the rarest class has fewer
    # than half as many examples as the most frequent one.
    print(f"\n {split_name.upper()} - label min/max count:")
    print(f"Min: {value_counts.min()}, Max: {value_counts.max()}, Unique labels: {value_counts.shape[0]}")
    if value_counts.min() / value_counts.max() < 0.5:
        print("Potential imbalance detected!")
    else:
        print("Labels appear relatively balanced.")

    # Check for missing values in every column of the split
    print("\n Checking for missing values:")
    print(split_df.isnull().sum())

# Produce the distribution report for the train, validation and test splits.
for current_split in ("train", "validation", "test"):
    plot_label_distribution(current_split)


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the SciBERT checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Upper bound on the number of tokens kept per document.
MAX_LENGTH = 512


# Tokenization function
def tokenize_function(example):
    """Tokenize a batch of examples, truncating each text to MAX_LENGTH tokens.

    No padding is applied here: the DataCollatorWithPadding handed to the
    Trainer pads each batch when it is assembled, so padding every example to
    the maximum length up front would only inflate the stored dataset.

    Args:
        example: Batch dictionary from ``Dataset.map(batched=True)`` that
            contains a ``"text"`` column.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], truncation=True, max_length=MAX_LENGTH)


tokenized_train = dataset["train"].map(tokenize_function, batched=True)
tokenized_val = dataset["validation"].map(tokenize_function, batched=True)
tokenized_test = dataset["test"].map(tokenize_function, batched=True)

In [None]:
# Metrics

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: Prediction object exposing ``predictions`` (per-class scores,
            one row per example) and ``label_ids`` (true class ids).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    # The predicted class is the index of the highest score in each row.
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    # zero_division=0 yields the same numbers as the default setting but does
    # not emit a warning when some class is never predicted (common during
    # the first epochs).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Model

from transformers import AutoModelForSequenceClassification

# Extract the label mapping from the dataset
label_list = tokenized_train.features["label"].names
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# Derive the size of the classification head from the label list so that it
# can never disagree with the id2label / label2id mappings passed alongside.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
# Apply LoRA

from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings for a sequence-classification task: rank 8, scaling alpha 16
# and 10% dropout on the adapter layers.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)
lora_config = LoraConfig(**lora_settings)

# Wrap the classifier with the LoRA adapters and report how many parameters
# remain trainable.
# NOTE(review): this rebinds `model`, so running the cell a second time wraps
# the already-wrapped model again; re-run the model-loading cell first.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Training arguments

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

import torch

# Evaluation and checkpointing both happen once per epoch, which
# load_best_model_at_end requires; the best checkpoint is the one with the
# lowest validation loss.
training_args = TrainingArguments(
    output_dir="./results_arx",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=100,
    # Mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    warmup_ratio=0.1,
    metric_for_best_model="eval_loss",
    save_strategy="epoch",
    run_name="arxiv-classification",
    # The string "none" disables all reporting integrations; passing the
    # Python value None does not (it falls back to the library default).
    report_to="none",
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch to a common length using the
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from transformers import EarlyStoppingCallback

import inspect

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` and deprecate the old name. Pick whichever keyword the
# installed version accepts so the cell works across releases.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # Stop once the monitored metric has failed to improve for two
    # consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()

In [None]:
from sklearn.metrics import classification_report

# Get predictions on the validation set
predictions = trainer.predict(tokenized_val)

# Turn the per-class scores into predicted class ids and fetch the true labels
preds = predictions.predictions.argmax(axis=1)
labels = predictions.label_ids

# Pass the complete list of class ids so the report rows stay aligned with
# `label_list` even when a class is missing from both the labels and the
# predictions (otherwise scikit-learn raises on the length mismatch).
print(
    classification_report(
        labels,
        preds,
        labels=list(range(len(label_list))),
        target_names=label_list,
        zero_division=0,
    )
)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Constructing the confusion matrix over every known class id, so that its
# shape always matches `label_list` even if a class never appears in the
# validation labels or predictions.
class_ids = list(range(len(label_list)))
cm = confusion_matrix(labels, preds, labels=class_ids)

# Visualizing the matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_list)
fig, ax = plt.subplots(figsize=(10, 10))
disp.plot(ax=ax, cmap='Blues', xticks_rotation=45)
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Get predictions on the test set

test_predictions = trainer.predict(tokenized_test)
test_preds = test_predictions.predictions.argmax(axis=1)
test_labels = test_predictions.label_ids

# Pass the complete list of class ids so the report rows stay aligned with
# `label_list` even when a class is absent from the test labels/predictions.
print(
    classification_report(
        test_labels,
        test_preds,
        labels=list(range(len(label_list))),
        target_names=label_list,
        zero_division=0,
    )
)