In [3]:
import json
import pandas as pd
import torch
from datasets import Dataset, train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the raw paper metadata from JSON.
# The encoding is pinned to UTF-8: without it `open` falls back to the platform
# locale default, which can mis-decode non-ASCII characters in the file.
with open("metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

In [None]:
# Extract relevant fields
def process_metadata(metadata, label_fn=None):
    """Flatten paper metadata into a two-column training table.

    Each entry of ``metadata["data"]`` becomes one row whose ``text`` is
    ``"<title>. <abstract> Keywords: <fields joined by ', '>"``.

    Args:
        metadata: Mapping with a ``"data"`` list of paper dicts. The keys
            ``title``, ``abstract`` and ``fieldsOfStudy`` are read; a missing
            key and an explicit ``None`` value are both treated as empty.
        label_fn: Optional callable ``paper -> label`` used to label each row.
            When omitted, every row receives the placeholder label ``1``.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label``. The columns are
        present even when ``metadata["data"]`` is empty.
    """
    rows = []
    for paper in metadata["data"]:
        # `.get(key, default)` only covers *missing* keys; a JSON null comes
        # back as None, so fall back with `or` as well.
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        keywords = ", ".join(paper.get("fieldsOfStudy") or [])
        text = f"{title}. {abstract} Keywords: {keywords}"
        # Placeholder labelling: a constant label gives a classifier nothing to
        # learn from; pass `label_fn` to encode the real relevance criteria.
        label = 1 if label_fn is None else label_fn(paper)
        rows.append({"text": text, "label": label})
    # Naming the columns keeps the schema stable for an empty input.
    return pd.DataFrame(rows, columns=["text", "label"])

In [None]:
# Turn the loaded metadata into the text/label frame used for training.
df = process_metadata(metadata)

# Tokenizer loaded from the SciBERT uncased checkpoint.
scibert_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = BertTokenizer.from_pretrained(scibert_checkpoint)

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize the ``text`` column of a batch with the notebook-level tokenizer.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the strings to
            encode (the form ``Dataset.map(..., batched=True)`` passes in).
        max_length: Length every sequence is padded/truncated to. Defaults to
            512, the usual input limit for BERT-base style encoders.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    # The length is passed explicitly rather than relying on the tokenizer's
    # own configured maximum, which a checkpoint may not define
    # (NOTE(review): confirm `tokenizer.model_max_length` for this checkpoint).
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Wrap the frame as a Hugging Face dataset and tokenize it in batches.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hold out 20% of the rows for validation. A fixed seed keeps the shuffle (and
# so every downstream number) reproducible across Restart & Run All, and the
# splits are looked up by key instead of relying on the order of `.values()`.
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
val_dataset = split["test"]

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SciBERT checkpoint loaded with a 2-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=2,
)
model.to(device)

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and deprecate the old keyword (NOTE(review): the newest releases reportedly
# reject it outright — confirm against the installed version). TrainingArguments
# is a dataclass, so a field with a default shows up as a class attribute; use
# that to pick whichever keyword this install understands.
_eval_strategy_key = (
    "eval_strategy"
    if hasattr(TrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    **{_eval_strategy_key: "epoch"},
)

In [None]:
# Bundle the model, its hyper-parameters, the tokenizer and both data splits
# into a single Trainer.
# NOTE(review): newer transformers releases appear to prefer `processing_class`
# over `tokenizer` here — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning with the configured Trainer.
trainer.train()

# Save the model and the tokenizer side by side in one directory.
export_dir = "scibert_relevance_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

In [None]:
def classify_paper(text, max_length=512):
    """Classify one paper description as relevant or not relevant.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The paper text to score. A single input is expected, because the
            predicted class is read with ``.item()``.
        max_length: Maximum number of tokens kept after truncation. Defaults
            to 512, the usual input limit for BERT-base style encoders.

    Returns:
        dict with two keys:
          * ``"label"``: ``"Relevant"`` when class 1 has the highest
            probability, otherwise ``"Not Relevant"``.
          * ``"confidence"``: the softmax probabilities as a nested list
            (one row per input, one entry per class).
    """
    # Switch to inference mode so layers such as dropout are disabled; without
    # this the model may still be in training mode and give noisy predictions.
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Move every input tensor to the device the model was placed on.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return {"label": "Relevant" if predicted_label == 1 else "Not Relevant", "confidence": probabilities.tolist()}

In [None]:
# Smoke-test the classifier on one hand-written example.
sample_text = "Deep learning models are advancing NLP rapidly. Keywords: Machine Learning, AI."
prediction = classify_paper(sample_text)
print(prediction)