In [None]:
!pip install datasets

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load your labeled data (Make sure the file has the columns: 'PICO', 'Paper', and 'SimilarityScore')
df = pd.read_csv('paper_data_labeled.csv')

# Create a dataset from the dataframe
dataset = Dataset.from_pandas(df)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the 'PICO' and 'Paper' columns together (pair the text)
def tokenize_function(examples):
    return tokenizer(examples['PICO'], examples['Paper'], padding='max_length', truncation=True)

# Apply the tokenizer to the dataset
dataset = dataset.map(tokenize_function, batched=True)

# Split into train and validation sets (80-20 split)
train_dataset = dataset.shuffle(seed=42).select([i for i in list(range(int(0.8 * len(dataset))))])
eval_dataset = dataset.shuffle(seed=42).select([i for i in list(range(int(0.8 * len(dataset))), len(dataset))])

# Load the pre-trained BERT model for sequence classification (regression task)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
)

# Define a function to compute the metrics (using MSE to measure similarity)
def compute_metrics(p):
    preds = p.predictions
    labels = p.label_ids
    mse = mean_squared_error(labels, preds)
    return {"mse": mse}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the model
trainer.save_model('./bert_similarity_model')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map:   0%|          | 0/5 [00:00<?, ? examples/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Map: 100%|██

TypeError: list expected at most 1 argument, got 2