<a href="https://colab.research.google.com/github/ved1beta/RAG_model/blob/main/arvix_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
from datasets import Dataset

In [None]:
!kaggle datasets download cornell-university/arxiv
!unzip arxiv.zip


Dataset URL: https://www.kaggle.com/datasets/cornell-university/arxiv
License(s): CC0-1.0
Downloading arxiv.zip to /content
100% 1.36G/1.36G [00:10<00:00, 165MB/s]
100% 1.36G/1.36G [00:10<00:00, 138MB/s]
Archive:  arxiv.zip
  inflating: arxiv-metadata-oai-snapshot.json  


In [None]:
# Read the complete JSON-lines snapshot into a DataFrame.
# `id` is forced to str: with dtype inference arXiv identifiers such as
# "0704.0001" are parsed as floats (704.0001), which drops leading/trailing
# zeros and makes the identifier unrecoverable.
# NOTE(review): this loads the entire snapshot into memory at once; the
# following cell reads only a subset -- confirm the full load is needed.
df = pd.read_json(
    'arxiv-metadata-oai-snapshot.json',
    lines=True,
    dtype={'id': str},
)

# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df)

# Test access
print(dataset[100])

In [None]:

import pandas as pd
from datasets import Dataset
# Load only the first `chunk_size` records of the snapshot.
# The file is opened as a chunked reader and just the first chunk is taken,
# so the rest of the file is never parsed. (The previous extra
# `nrows=100000` read was discarded immediately and has been removed.)
chunk_size = 50000

# * `dtype={'id': str}` keeps arXiv identifiers such as "0704.0001" as text;
#   with dtype inference they become floats (704.0001).
# * The reader is used as a context manager so the underlying file handle is
#   closed as soon as the first chunk has been read.
with pd.read_json(
    'arxiv-metadata-oai-snapshot.json',
    lines=True,
    chunksize=chunk_size,
    dtype={'id': str},
) as chunks:
    df = next(chunks)  # Gets first chunk only

# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df)

# Optional: Create train/test split
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Print info about the dataset
print(f"Dataset size: {len(dataset)}")
print("\nSample entry:")
print(dataset[0])

Dataset size: 50000

Sample entry:
{'id': 704.0001, 'submitter': 'Pavel Nadolsky', 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan", 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies', 'comments': '37 pages, 15 figures; published version', 'journal-ref': 'Phys.Rev.D76:013009,2007', 'doi': '10.1103/PhysRevD.76.013009', 'report-no': 'ANL-HEP-PR-07-12', 'categories': 'hep-ph', 'license': None, 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstr

In [None]:
# 1. Hold out 20% of the examples as a test split (fixed seed).
train_test_dataset = dataset.train_test_split(seed=42, test_size=0.2)

# 2. Build the label mapping: every distinct `categories` string gets an
#    integer id. The encoder is fitted on the whole dataset, so both splits
#    share one mapping.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
all_categories = label_encoder.fit_transform(dataset['categories'])

# Attach the encoded ids to each split as a `labels` column.
for split_name in ('train', 'test'):
    current_split = train_test_dataset[split_name]
    encoded_labels = label_encoder.transform(current_split['categories'])
    train_test_dataset[split_name] = current_split.add_column('labels', encoded_labels)

def preprocess_function(examples):
    """Tokenize the abstracts of a batch of examples.

    Used with ``Dataset.map(..., batched=True)`` below, so ``examples`` is a
    mapping whose ``'abstract'`` entry holds the texts of the batch.

    Every abstract is truncated and padded to exactly 512 tokens. The result
    is returned as the tokenizer's plain output (no ``return_tensors``): the
    mapped dataset stores the values itself, so building framework tensors
    here is unnecessary, and this matches the later definition of this
    function in the notebook.

    NOTE(review): ``tokenizer`` is a notebook-level global that is not
    defined in the visible cells -- confirm it is created before this runs.
    """
    return tokenizer(
        examples['abstract'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )

# 3. Tokenize both splits.
# Every raw metadata column is dropped except `labels`, which the Trainer
# needs. Keeping it during the map avoids removing it and then re-adding it
# with a second full copy of the dataset.
# NOTE(review): `tokenizer`, `model_name`, `TrainingArguments`,
# `AutoModelForSequenceClassification` and `Trainer` are not defined in the
# visible cells of this notebook -- confirm they are imported/created earlier.
# NOTE(review): a later cell repeats this same pipeline; the recorded run of
# this cell ended in KeyboardInterrupt.
tokenized_train = train_test_dataset['train'].map(
    preprocess_function,
    batched=True,
    remove_columns=[
        name for name in train_test_dataset['train'].column_names if name != 'labels'
    ],
)

tokenized_test = train_test_dataset['test'].map(
    preprocess_function,
    batched=True,
    remove_columns=[
        name for name in train_test_dataset['test'].column_names if name != 'labels'
    ],
)

# 4. (Labels are already present in the tokenized datasets; nothing to add.)

# 5. Set up training arguments
training_args = TrainingArguments(
    output_dir="./arxiv_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",        # Changed from evaluation_strategy
    save_strategy="epoch",        # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
    # Disable experiment-tracker integrations: in the recorded run the
    # default reporting stopped training on an interactive wandb API-key
    # prompt.
    report_to="none",
)

# 6. Initialize model with one output per encoded category
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# 7. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

# 8. Train the model
trainer.train()

# 9. Save the model, the tokenizer and the label encoder so predictions can
#    be mapped back to category strings later.
model.save_pretrained("./arxiv_model_final")
tokenizer.save_pretrained("./arxiv_model_final")
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# 10. Test the model
def test_model(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(-1)
    return label_encoder.inverse_transform(predictions)

Flattening the indices:   0%|          | 0/40000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# 1. Hold out 20% of the examples as a test split (fixed seed).
train_test_dataset = dataset.train_test_split(seed=42, test_size=0.2)

# 2. Build the label mapping: every distinct `categories` string gets an
#    integer id. The encoder is fitted on the whole dataset, so both splits
#    share one mapping.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
all_categories = label_encoder.fit_transform(dataset['categories'])

# Attach the encoded ids to each split as a `labels` column.
for split_name in ('train', 'test'):
    current_split = train_test_dataset[split_name]
    encoded_labels = label_encoder.transform(current_split['categories'])
    train_test_dataset[split_name] = current_split.add_column('labels', encoded_labels)

# 3. Tokenize the datasets
def preprocess_function(examples):
    """Tokenize the abstracts of a batch of examples.

    ``examples`` is indexed by ``'abstract'`` to obtain the texts. Each one
    is truncated and padded to exactly 512 tokens, and the tokenizer's
    output is returned unchanged.
    """
    abstracts = examples['abstract']
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(abstracts, **tokenizer_options)

# Tokenize both splits.
# Every raw metadata column is dropped except `labels`, which the Trainer
# needs. Keeping it during the map avoids removing it and then re-adding it
# with a second full copy of the dataset.
# NOTE(review): `tokenizer`, `model_name`, `TrainingArguments`,
# `AutoModelForSequenceClassification` and `Trainer` are not defined in the
# visible cells of this notebook -- confirm they are imported/created earlier.
tokenized_train = train_test_dataset['train'].map(
    preprocess_function,
    batched=True,
    remove_columns=[
        name for name in train_test_dataset['train'].column_names if name != 'labels'
    ],
)

tokenized_test = train_test_dataset['test'].map(
    preprocess_function,
    batched=True,
    remove_columns=[
        name for name in train_test_dataset['test'].column_names if name != 'labels'
    ],
)

# 4. (Labels are already present in the tokenized datasets; nothing to add.)

# 5. Set up training arguments
training_args = TrainingArguments(
    output_dir="./arxiv_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",        # Changed from evaluation_strategy
    save_strategy="epoch",        # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
    # Disable experiment-tracker integrations so a fresh runtime does not
    # stop on an interactive wandb API-key prompt (as an earlier recorded
    # run of this pipeline did).
    report_to="none",
)

# 6. Initialize model with one output per encoded category
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# 7. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

# 8. Train the model
trainer.train()

# 9. Save the model, the tokenizer and the label encoder so predictions can
#    be mapped back to category strings later.
model.save_pretrained("./arxiv_model_final")
tokenizer.save_pretrained("./arxiv_model_final")
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# 10. Test the model
def test_model(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(-1)
    return label_encoder.inverse_transform(predictions)

Flattening the indices:   0%|          | 0/40000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
