In [1]:
import os
import pandas as pd
import numpy as np
import regex as re
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Function to read text files and extract required data
def parse_text_files(base_path):
    data = []
    
    for category in ['Cancer', 'Non-Cancer']:
        folder_path = os.path.join(base_path, category)
        
        if not os.path.exists(folder_path):
            continue
        
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            
            # Ensure we only process text files
            if not filename.endswith(".txt"):
                continue

            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                
                # Extracting ID, Title, and Abstract
                id_ = filename.replace('.txt', '')  # File name = ID
                title = ''
                abstract = ''
                if len(lines) > 1:
                    title = lines[1].strip().replace('Title: ', '')  # Second line = title
                    title = re.sub(r'^\W+|\W+$', '', title)
                if len(lines) > 2:
                    abstract = ' '.join(line.strip() for line in lines[2:])  # Rest is Abstract
                    abstract = abstract.replace('Abstract: ', '')
                    abstract = re.sub(r'^\W+|\W+$', '', abstract)
                data.append([id_, title, abstract, category])

    # Creating DataFrame
    df = pd.DataFrame(data, columns=["ID", "Title", "Abstract", "Category"])
    return df

# Parse dataset and create DataFrame
df = parse_text_files('Dataset')
df

Unnamed: 0,ID,Title,Abstract,Category
0,31055803,Analysis of age-specific cytogenetic changes a...,OBJECTIVE: To characterize cytogenetic changes...,Cancer
1,31164412,T-Cell Deletion of MyD88 Connects IL17 and Ika...,Cancer development requires a favorable tissue...,Cancer
2,31094905,MYCN Amplified Relapse Following Resolution of...,Congenital neuroblastoma with placental involv...,Cancer
3,31498304,In Vivo Inhibition of MicroRNA to Decrease Tum...,MicroRNAs (miRNAs) are important regulators of...,Cancer
4,30897768,Breast Cancer and miR-SNPs: The Importance of ...,Recent studies in cancer diagnostics have iden...,Cancer
...,...,...,...,...
995,26095439,Urinary 11beta-PGF2alpha and N-methyl histamin...,BACKGROUND: The utility of measuring histamine...,Non-Cancer
996,24850616,A limited form of proteus syndrome with bilate...,IMPORTANCE: Proteus syndrome is an extremely r...,Non-Cancer
997,24402730,Benign mast cell hyperplasia and atypical mast...,Introduction. Lichen planus (LP) is a chronic ...,Non-Cancer
998,26513044,Nevus anemicus associated with neurofibromatos...,Neurofibromatosis type 1 (NF1) is a multisyste...,Non-Cancer


In [3]:
    df = df.dropna(subset=['Title', 'Abstract', 'Category'])
df

Unnamed: 0,ID,Title,Abstract,Category
0,31055803,Analysis of age-specific cytogenetic changes a...,OBJECTIVE: To characterize cytogenetic changes...,Cancer
1,31164412,T-Cell Deletion of MyD88 Connects IL17 and Ika...,Cancer development requires a favorable tissue...,Cancer
2,31094905,MYCN Amplified Relapse Following Resolution of...,Congenital neuroblastoma with placental involv...,Cancer
3,31498304,In Vivo Inhibition of MicroRNA to Decrease Tum...,MicroRNAs (miRNAs) are important regulators of...,Cancer
4,30897768,Breast Cancer and miR-SNPs: The Importance of ...,Recent studies in cancer diagnostics have iden...,Cancer
...,...,...,...,...
995,26095439,Urinary 11beta-PGF2alpha and N-methyl histamin...,BACKGROUND: The utility of measuring histamine...,Non-Cancer
996,24850616,A limited form of proteus syndrome with bilate...,IMPORTANCE: Proteus syndrome is an extremely r...,Non-Cancer
997,24402730,Benign mast cell hyperplasia and atypical mast...,Introduction. Lichen planus (LP) is a chronic ...,Non-Cancer
998,26513044,Nevus anemicus associated with neurofibromatos...,Neurofibromatosis type 1 (NF1) is a multisyste...,Non-Cancer


In [4]:
from huggingface_hub import login
login(token="<token>")
# Load BioMiniBERT tokenizer and model
MODEL_NAME = "nlpie/tiny-biobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
# Create Label column Cancer = 1 and Non-Cancer = 0
label_mapping = {'Non-Cancer': 0, 'Cancer': 1}
df['Label'] = df['Category'].map(label_mapping)
df

Unnamed: 0,ID,Title,Abstract,Category,Label
0,31055803,Analysis of age-specific cytogenetic changes a...,OBJECTIVE: To characterize cytogenetic changes...,Cancer,1
1,31164412,T-Cell Deletion of MyD88 Connects IL17 and Ika...,Cancer development requires a favorable tissue...,Cancer,1
2,31094905,MYCN Amplified Relapse Following Resolution of...,Congenital neuroblastoma with placental involv...,Cancer,1
3,31498304,In Vivo Inhibition of MicroRNA to Decrease Tum...,MicroRNAs (miRNAs) are important regulators of...,Cancer,1
4,30897768,Breast Cancer and miR-SNPs: The Importance of ...,Recent studies in cancer diagnostics have iden...,Cancer,1
...,...,...,...,...,...
995,26095439,Urinary 11beta-PGF2alpha and N-methyl histamin...,BACKGROUND: The utility of measuring histamine...,Non-Cancer,0
996,24850616,A limited form of proteus syndrome with bilate...,IMPORTANCE: Proteus syndrome is an extremely r...,Non-Cancer,0
997,24402730,Benign mast cell hyperplasia and atypical mast...,Introduction. Lichen planus (LP) is a chronic ...,Non-Cancer,0
998,26513044,Nevus anemicus associated with neurofibromatos...,Neurofibromatosis type 1 (NF1) is a multisyste...,Non-Cancer,0


In [6]:
# Split data into Train (70%), Validation (10%) & Test (20%)
train_texts, tmp_texts, train_labels, tmp_labels = train_test_split(df['Abstract'], df['Label'], test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(df['Abstract'], df['Label'], test_size=2/3, random_state=42)

# Tokenize text
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)

# Convert to HuggingFace Dataset
train_dataset = HFDataset.from_dict({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"], "labels": list(train_labels)})
val_dataset = HFDataset.from_dict({"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"], "labels": list(val_labels)})
test_dataset = HFDataset.from_dict({"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"], "labels": list(test_labels)})

In [7]:
# Load BioMiniBERT model for binary classification
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
device = 'cpu'
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpie/tiny-biobert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [8]:
# Performance on base model
import torch.nn.functional as F
def predict(text):
    # Tokenize the input and move to device
    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to(device)

    # Get model predictions without computing gradients
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1).squeeze().cpu().numpy()  # Convert to NumPy
        predicted_label = "Cancer" if torch.argmax(logits, dim=1).item() == 1 else "Non-Cancer"

    return pd.Series([predicted_label, float(probs[0]), float(probs[1])])

# Apply predictions to the dataset
df[["Predicted_Category", "Non-Cancer Score", "Cancer Score"]] = df["Abstract"].apply(predict)

# Evaluate performance
accuracy = accuracy_score(df["Category"], df["Predicted_Category"])
f1 = f1_score(df["Category"], df["Predicted_Category"], pos_label="Cancer")
conf_matrix = confusion_matrix(df["Category"], df["Predicted_Category"])

# Display results
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.46
F1 Score: 0.07
Confusion Matrix:
 [[ 20 480]
 [ 57 443]]


In [9]:
df

Unnamed: 0,ID,Title,Abstract,Category,Label,Predicted_Category,Non-Cancer Score,Cancer Score
0,31055803,Analysis of age-specific cytogenetic changes a...,OBJECTIVE: To characterize cytogenetic changes...,Cancer,1,Non-Cancer,0.509001,0.490999
1,31164412,T-Cell Deletion of MyD88 Connects IL17 and Ika...,Cancer development requires a favorable tissue...,Cancer,1,Non-Cancer,0.501005,0.498995
2,31094905,MYCN Amplified Relapse Following Resolution of...,Congenital neuroblastoma with placental involv...,Cancer,1,Non-Cancer,0.501208,0.498792
3,31498304,In Vivo Inhibition of MicroRNA to Decrease Tum...,MicroRNAs (miRNAs) are important regulators of...,Cancer,1,Non-Cancer,0.501152,0.498848
4,30897768,Breast Cancer and miR-SNPs: The Importance of ...,Recent studies in cancer diagnostics have iden...,Cancer,1,Non-Cancer,0.506756,0.493244
...,...,...,...,...,...,...,...,...
995,26095439,Urinary 11beta-PGF2alpha and N-methyl histamin...,BACKGROUND: The utility of measuring histamine...,Non-Cancer,0,Non-Cancer,0.510474,0.489526
996,24850616,A limited form of proteus syndrome with bilate...,IMPORTANCE: Proteus syndrome is an extremely r...,Non-Cancer,0,Non-Cancer,0.504556,0.495444
997,24402730,Benign mast cell hyperplasia and atypical mast...,Introduction. Lichen planus (LP) is a chronic ...,Non-Cancer,0,Non-Cancer,0.506013,0.493987
998,26513044,Nevus anemicus associated with neurofibromatos...,Neurofibromatosis type 1 (NF1) is a multisyste...,Non-Cancer,0,Non-Cancer,0.502532,0.497468


In [10]:
from sklearn.model_selection import ParameterGrid
def fine_tune(model, tokenizer, train_dataset, val_dataset, output_dir, batch_size=8, 
              num_epochs=3, weight_decay=0.01, learning_rate=5e-05, use_cuda=False):
    """
    Fine-tune model with custom hyperparameters.

    Parameters:
    - model: HuggingFace model
    - tokenizer: HuggingFace tokenizer
    - train_dataset: Dataset for training
    - val_dataset: Dataset for evaluation
    - output_dir: Directory to save fine-tuned model
    - batch_size: Training and eval batch size
    - num_epochs: Number of training epochs
    - weight_decay: Weight decay (L2 regularization)
    - learning_rate: Learning Rate
    - use_cuda: Use GPU if True

    Returns:
    - trainer: Trained HuggingFace Trainer object
    - metrics: Final evaluation metrics
    """

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        logging_dir=f"{output_dir}/logs",
        load_best_model_at_end=True,
        no_cuda=not use_cuda
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds)
        return {"accuracy": acc, "f1": f1}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Save model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    metrics = trainer.evaluate(test_dataset)
    return trainer, metrics

param_grid = {
    "batch_size": [8, 16],
    "num_epochs": [2, 3, 5],
    "weight_decay": [0.0, 0.01, 0.1],
    "learning_rate": [5e-5, 3e-5, 2e-5],
    "warmup_steps": [0, 100, 500]
}

best_f1 = 0
best_params = None
best_trainer = None

for params in ParameterGrid(param_grid):
    print(f"Testing params: {params}")
    trainer, metrics = fine_tune(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        output_dir=f"./biobert_{params['batch_size']}_{params['num_epochs']}_{params['weight_decay']}",
        batch_size=params['batch_size'],
        num_epochs=params['num_epochs'],
        weight_decay=params['weight_decay'],
        learning_rate=params['learning_rate'],
        use_cuda=True
    )

    f1 = metrics["eval_f1"]
    print("F1 Score:", f1)
    if f1 > best_f1:
        best_f1 = f1
        best_params = params
        best_trainer = trainer

print(f"\nBest F1: {best_f1:.4f} with params: {best_params}")

Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 0, 'weight_decay': 0.0}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2806,0.179123,0.963964,0.961783
2,0.1354,0.107324,0.96997,0.967742


F1 Score: 0.9705882352941176
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 0, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1512,0.0763,0.981982,0.981013
2,0.0666,0.065049,0.987988,0.987261


F1 Score: 0.9824046920821115
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 0, 'weight_decay': 0.1}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1472,0.078141,0.984985,0.984127
2,0.0738,0.067349,0.984985,0.984127


F1 Score: 0.9882697947214076
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 100, 'weight_decay': 0.0}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1439,0.026901,0.993994,0.993631
2,0.0688,0.016819,0.996997,0.996825


F1 Score: 0.9897810218978103
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 100, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0689,0.072211,0.990991,0.990596
2,0.0498,0.03223,0.993994,0.993711


F1 Score: 0.9855072463768116
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 100, 'weight_decay': 0.1}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.004593,0.996997,0.996825
2,0.0,0.050299,0.993994,0.993711


F1 Score: 0.9883381924198251
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 500, 'weight_decay': 0.0}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.112858,0.990991,0.990596
2,0.0,6e-06,1.0,1.0


F1 Score: 0.9898107714701602
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 500, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,1e-06,1.0,1.0
2,0.0,0.123439,0.990991,0.990596


F1 Score: 0.9912536443148688
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 2, 'warmup_steps': 500, 'weight_decay': 0.1}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.0,1.0,1.0
2,0.0,0.137138,0.990991,0.990596


F1 Score: 0.9912536443148688
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 3, 'warmup_steps': 0, 'weight_decay': 0.0}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.0,1.0,1.0
2,0.0,0.045686,0.996997,0.996825
3,0.0,0.0,1.0,1.0


F1 Score: 0.9898403483309144
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 3, 'warmup_steps': 0, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0
3,0.0,0.012154,0.996997,0.996845


F1 Score: 0.9898403483309144
Testing params: {'batch_size': 8, 'learning_rate': 5e-05, 'num_epochs': 3, 'warmup_steps': 0, 'weight_decay': 0.1}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.0,1.0,1.0


KeyboardInterrupt: 

In [None]:
# Performance on fine-tuned model
test_results = trainer.evaluate(test_dataset)
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {test_results['eval_f1']:.4f}")

# Get Predictions on CPU
predictions = trainer.predict(test_dataset).predictions
pred_labels = torch.tensor(predictions).argmax(dim=-1).cpu().numpy()
true_labels = torch.tensor(np.array(test_labels)).cpu().numpy()

# Compute Confusion Matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(true_labels, pred_labels)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
# Predict Cancer vs Non-Cancer for new abstracts
def predict(text):
    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=-1).squeeze()
        pred_label = torch.argmax(probs).item()
    return pd.Series(["Cancer" if pred_label == 1 else "Non-Cancer", probs[0].item(), probs[1].item()])

# Apply classification on dataset
df[["Predicted_Category", 'Non-Cancer Score', 'Cancer Score']] = df["Abstract"].apply(predict)

In [None]:
df