Mount Google Drive to import the dataset.

In [1]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read through normal file paths in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import csv
import random
import pandas as pd
from transformers import pipeline
    # Load the dataset with specified encoding
file_path = "/content/drive/MyDrive/cancer_dataset.csv"
# Prefer UTF-8. Decoding as latin1 never fails, but it turns multi-byte UTF-8
# characters (e.g. the "fi" ligature) into mojibake such as "ï¬" in the Text
# column. Fall back to latin1 only when the file is not valid UTF-8, which
# reproduces the previous behaviour exactly.
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='latin1')

Create a new CSV file containing the first 100 instances of each class.

In [3]:
# Fail fast when the loaded frame has nothing to work with
if df.empty:
    raise ValueError("The input CSV file is empty or not loaded correctly. Please check the file path and content.")

# Distinct labels of the 'Class' column, in order of first appearance
unique_classes = pd.unique(df['Class'])
print(f"Unique classes found: {unique_classes}")

# Function to filter the first 100 instances of each class
def filter_samples(df, classes, num_samples=100, class_column='Class'):
    """Return the first ``num_samples`` rows of every requested class.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain ``class_column``.
    classes : iterable
        Class labels to keep, processed in the given order.
    num_samples : int, default 100
        Maximum number of rows kept per class (taken from the top of ``df``).
    class_column : str, default 'Class'
        Name of the column holding the class labels.

    Returns
    -------
    pandas.DataFrame
        The selected rows, grouped class by class and keeping their original
        index. When no class has any rows, an empty frame with the columns of
        ``df`` is returned, so callers can test ``.empty`` instead of hitting
        the ``ValueError`` that ``pd.concat`` raises on an empty list.
    """
    filtered_samples = []
    for class_name in classes:
        class_samples = df[df[class_column] == class_name].head(num_samples)
        if class_samples.empty:
            # Report the miss but keep going so the other classes are still collected
            print(f"No samples found for class {class_name}. Please check the class names and column.")
        else:
            filtered_samples.append(class_samples)
    if not filtered_samples:
        # Zero-row slice: same columns as the input, no data
        return df.iloc[0:0]
    return pd.concat(filtered_samples)

# Keep at most 100 leading rows per class
filtered_df = filter_samples(df, unique_classes, num_samples=100)

# Abort when nothing survived the filtering
if filtered_df.empty:
    raise ValueError("The filtered DataFrame is empty. Please ensure there are enough instances of each class in the dataset.")

# Persist the subset; the row index is not written to the file
filtered_df.to_csv('hundredSamples.csv', index=False)

# Preview the subset and report its size
print("First few rows of the new dataset:", filtered_df.head(), sep="\n")
print("Shape of the new dataset:", filtered_df.shape)


Unique classes found: ['Thyroid_Cancer' 'Colon_Cancer' 'Lung_Cancer']
First few rows of the new dataset:
   No           Class                                               Text
0   0  Thyroid_Cancer  Thyroid surgery in  children in a single insti...
1   1  Thyroid_Cancer  " The adopted strategy was the same as that us...
2   2  Thyroid_Cancer  coronary arterybypass grafting thrombosis ï¬b...
3   3  Thyroid_Cancer   Solitary plasmacytoma SP of the skull is an u...
4   4  Thyroid_Cancer   This study aimed to investigate serum matrix ...
Shape of the new dataset: (300, 3)


BART

In [5]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the 100-per-class subset written by the sampling cell
df_data = pd.read_csv('hundredSamples.csv')

# Candidate labels each text is scored against
candidate_labels = ["colon cancer", "thyroid cancer", "lung cancer"]

# Zero-shot classification pipeline backed by BART-large fine-tuned on MNLI
# (device=0 selects the first GPU, -1 the CPU)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0 if device.type == 'cuda' else -1)

# Classify the whole 'Text' column in a single pipeline call. Handing over the
# list lets the pipeline iterate internally instead of being invoked once per
# row, which is what triggers the "using the pipelines sequentially on GPU"
# warning.
results = classifier(df_data['Text'].tolist(), candidate_labels)
if isinstance(results, dict):
    # A lone sequence may come back as a bare dict rather than a list
    results = [results]

# Labels are returned sorted by descending score, so index 0 is the prediction
predicted_labels = [result['labels'][0] for result in results]
confidence_scores = [result['scores'][0] for result in results]

# Add the results to the DataFrame
df_data['predicted_label'] = predicted_labels
df_data['confidence_score'] = confidence_scores

# Save the updated DataFrame to a new CSV file
df_data.to_csv('bart_hundredSamples.csv', index=False)

# Display the updated DataFrame
print("First few rows of the updated DataFrame with predictions:")
print(df_data.head())


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


First few rows of the updated DataFrame with predictions:
   No           Class                                               Text  \
0   0  Thyroid_Cancer  Thyroid surgery in  children in a single insti...   
1   1  Thyroid_Cancer  " The adopted strategy was the same as that us...   
2   2  Thyroid_Cancer  coronary arterybypass grafting thrombosis ï¬b...   
3   3  Thyroid_Cancer   Solitary plasmacytoma SP of the skull is an u...   
4   4  Thyroid_Cancer   This study aimed to investigate serum matrix ...   

  predicted_label  confidence_score  
0  thyroid cancer          0.371749  
1     lung cancer          0.369161  
2  thyroid cancer          0.348231  
3    colon cancer          0.421600  
4  thyroid cancer          0.463930  


In [6]:
# Reload the BART predictions and bring the gold labels into the same
# lower-case, space-separated form as the candidate labels
df_bart = (
    pd.read_csv('bart_hundredSamples.csv')
    .assign(Class=lambda frame: frame['Class'].str.lower().str.replace('_', ' '))
)

In [7]:
from sklearn.metrics import classification_report, accuracy_score
# Gold labels versus BART predictions: overall accuracy, then per-class metrics
true_labels = df_bart['Class']
accuracy = accuracy_score(true_labels, df_bart['predicted_label'])
print("Accuracy:", accuracy)
report = classification_report(true_labels, df_bart['predicted_label'])
print("Classification Report:", report, sep="\n")

Accuracy: 0.5333333333333333
Classification Report:
                precision    recall  f1-score   support

  colon cancer       0.51      0.58      0.54       100
   lung cancer       0.51      0.76      0.61       100
thyroid cancer       0.68      0.26      0.38       100

      accuracy                           0.53       300
     macro avg       0.57      0.53      0.51       300
  weighted avg       0.57      0.53      0.51       300



GPT-2

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the 100-per-class subset written by the sampling cell
df = pd.read_csv('hundredSamples.csv')

# Candidate labels each text is scored against
candidate_labels = ["colon cancer", "thyroid cancer", "lung cancer"]

# NOTE(review): GPT2LMHeadModel carries a language-modelling head, not an NLI /
# sequence-classification head. The pipeline reports that it cannot determine
# an 'entailment' label for it and some confidence scores come out as NaN, so
# these predictions should be interpreted with caution.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define the zero-shot classification pipeline (device=0 is the first GPU, -1 the CPU)
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0 if device.type == 'cuda' else -1)

# Classify the whole 'Text' column in a single pipeline call instead of
# invoking the pipeline once per row.
results = classifier(df['Text'].tolist(), candidate_labels)
if isinstance(results, dict):
    # A lone sequence may come back as a bare dict rather than a list
    results = [results]

# Labels are returned sorted by descending score, so index 0 is the prediction
predicted_labels = [result['labels'][0] for result in results]
confidence_scores = [result['scores'][0] for result in results]

# Add the results to the DataFrame
df['predicted_label'] = predicted_labels
df['confidence_score'] = confidence_scores

# NOTE(review): this path is also the output file of the BART cell, so the BART
# predictions on disk are overwritten here. The evaluation cell that follows
# reads this same path; rename both together if a GPT-2 specific file is wanted.
df.to_csv('bart_hundredSamples.csv', index=False)

# Display the updated DataFrame
print("First few rows of the updated DataFrame with predictions:")
print(df.head())


Device: cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`
  scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)


First few rows of the updated DataFrame with predictions:
   No           Class                                               Text  \
0   0  Thyroid_Cancer  Thyroid surgery in  children in a single insti...   
1   1  Thyroid_Cancer  " The adopted strategy was the same as that us...   
2   2  Thyroid_Cancer  coronary arterybypass grafting thrombosis ï¬b...   
3   3  Thyroid_Cancer   Solitary plasmacytoma SP of the skull is an u...   
4   4  Thyroid_Cancer   This study aimed to investigate serum matrix ...   

  predicted_label  confidence_score  
0     lung cancer               NaN  
1     lung cancer               NaN  
2     lung cancer               NaN  
3    colon cancer          0.640211  
4  thyroid cancer          0.621803  


In [None]:
# Reload the GPT-2 predictions. Despite its name, 'bart_hundredSamples.csv' is
# the file the GPT-2 cell saves to. Gold labels are brought into the same
# lower-case, space-separated form as the candidate labels.
df_gpt = (
    pd.read_csv('bart_hundredSamples.csv')
    .assign(Class=lambda frame: frame['Class'].str.lower().str.replace('_', ' '))
)


In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Gold labels versus GPT-2 predictions: overall accuracy, then per-class metrics
true_labels = df_gpt['Class']
accuracy = accuracy_score(true_labels, df_gpt['predicted_label'])
print("Accuracy:", accuracy)
report = classification_report(true_labels, df_gpt['predicted_label'])
print("Classification Report:", report, sep="\n")

Accuracy: 0.58
Classification Report:
                precision    recall  f1-score   support

  colon cancer       0.70      0.55      0.61       100
   lung cancer       0.57      0.97      0.72       100
thyroid cancer       0.42      0.22      0.29       100

      accuracy                           0.58       300
     macro avg       0.56      0.58      0.54       300
  weighted avg       0.56      0.58      0.54       300



RoBERTa

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
import torch
import pandas as pd

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the 100-per-class subset written by the sampling cell
df = pd.read_csv('hundredSamples.csv')

# Candidate labels each text is scored against. In zero-shot classification
# these are free-form hypotheses, not labels the model was trained on.
candidate_labels = ["colon cancer", "thyroid cancer", "lung cancer"]

# Tokenizer and model: RoBERTa-large fine-tuned on MNLI
tokenizer = RobertaTokenizer.from_pretrained('roberta-large-mnli')
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')

# Move the model to the appropriate device
model.to(device)

# Zero-shot classification pipeline (device=0 is the first GPU, -1 the CPU)
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Classify the whole 'Text' column in a single pipeline call instead of
# invoking the pipeline once per row.
results = classifier(df['Text'].tolist(), candidate_labels)
if isinstance(results, dict):
    # A lone sequence may come back as a bare dict rather than a list
    results = [results]

# Labels are returned sorted by descending score, so index 0 is the prediction
predicted_labels = [result['labels'][0] for result in results]
confidence_scores = [result['scores'][0] for result in results]

# Add the results to the DataFrame
df['predicted_label'] = predicted_labels
df['confidence_score'] = confidence_scores

# Save the updated DataFrame to a new CSV file
df.to_csv('roberta1_hundredSamples.csv', index=False)

# Display the updated DataFrame
print("First few rows of the updated DataFrame with predictions:")
print(df.head())


Device: cuda


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


First few rows of the updated DataFrame with predictions:
   No           Class                                               Text  \
0   0  Thyroid_Cancer  Thyroid surgery in  children in a single insti...   
1   1  Thyroid_Cancer  " The adopted strategy was the same as that us...   
2   2  Thyroid_Cancer  coronary arterybypass grafting thrombosis ï¬b...   
3   3  Thyroid_Cancer   Solitary plasmacytoma SP of the skull is an u...   
4   4  Thyroid_Cancer   This study aimed to investigate serum matrix ...   

  predicted_label  confidence_score  
0  thyroid cancer          0.786961  
1    colon cancer          0.345437  
2    colon cancer          0.365856  
3    colon cancer          0.341721  
4  thyroid cancer          0.657570  


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Reload the RoBERTa predictions and bring the gold labels into the same
# lower-case, space-separated form as the candidate labels
df_roberta1 = (
    pd.read_csv('roberta1_hundredSamples.csv')
    .assign(Class=lambda frame: frame['Class'].str.lower().str.replace('_', ' '))
)

# Overall accuracy, then per-class metrics
true_labels = df_roberta1['Class']
accuracy = accuracy_score(true_labels, df_roberta1['predicted_label'])
print("Accuracy:", accuracy)
report = classification_report(true_labels, df_roberta1['predicted_label'])
print("Classification Report:", report, sep="\n")

Accuracy: 0.49
Classification Report:
                precision    recall  f1-score   support

  colon cancer       0.45      0.37      0.41       100
   lung cancer       0.52      0.68      0.59       100
thyroid cancer       0.49      0.42      0.45       100

      accuracy                           0.49       300
     macro avg       0.48      0.49      0.48       300
  weighted avg       0.48      0.49      0.48       300



BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import pandas as pd

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the 100-per-class subset written by the sampling cell
df = pd.read_csv('hundredSamples.csv')

# Candidate labels each text is scored against
candidate_labels = ["colon cancer", "thyroid cancer", "lung cancer"]

# NOTE(review): 'bert-base-uncased' ships without a trained classification
# head; the loader reports 'classifier.weight' / 'classifier.bias' as newly
# initialized and the pipeline cannot find an 'entailment' label. The scores
# below therefore come from random weights. Seeding torch before loading at
# least makes the run reproducible; it does not make the predictions meaningful.
torch.manual_seed(42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Move the model to the appropriate device
model.to(device)

# Zero-shot classification pipeline (device=0 is the first GPU, -1 the CPU)
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Classify the whole 'Text' column in a single pipeline call. Handing over the
# list lets the pipeline iterate internally instead of being invoked once per
# row, which is what triggers the "using the pipelines sequentially on GPU"
# warning.
results = classifier(df['Text'].tolist(), candidate_labels)
if isinstance(results, dict):
    # A lone sequence may come back as a bare dict rather than a list
    results = [results]

# Labels are returned sorted by descending score, so index 0 is the prediction
predicted_labels = [result['labels'][0] for result in results]
confidence_scores = [result['scores'][0] for result in results]

# Add the results to the DataFrame
df['predicted_label'] = predicted_labels
df['confidence_score'] = confidence_scores

# Save the updated DataFrame to a new CSV file
df.to_csv('bert_hundredSamples.csv', index=False)

# Display the updated DataFrame
print("First few rows of the updated DataFrame with predictions:")
print(df.head())


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


First few rows of the updated DataFrame with predictions:
   No           Class                                               Text  \
0   0  Thyroid_Cancer  Thyroid surgery in  children in a single insti...   
1   1  Thyroid_Cancer  " The adopted strategy was the same as that us...   
2   2  Thyroid_Cancer  coronary arterybypass grafting thrombosis ï¬b...   
3   3  Thyroid_Cancer   Solitary plasmacytoma SP of the skull is an u...   
4   4  Thyroid_Cancer   This study aimed to investigate serum matrix ...   

  predicted_label  confidence_score  
0  thyroid cancer          0.343436  
1     lung cancer          0.334531  
2     lung cancer          0.334961  
3    colon cancer          0.334161  
4  thyroid cancer          0.343634  


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Reload the BERT predictions and bring the gold labels into the same
# lower-case, space-separated form as the candidate labels
df_bert = (
    pd.read_csv('bert_hundredSamples.csv')
    .assign(Class=lambda frame: frame['Class'].str.lower().str.replace('_', ' '))
)

# Overall accuracy, then per-class metrics
true_labels = df_bert['Class']
accuracy = accuracy_score(true_labels, df_bert['predicted_label'])
print("Accuracy:", accuracy)
report = classification_report(true_labels, df_bert['predicted_label'])
print("Classification Report:", report, sep="\n")

Accuracy: 0.42
Classification Report:
                precision    recall  f1-score   support

  colon cancer       0.49      0.40      0.44       100
   lung cancer       0.37      0.65      0.47       100
thyroid cancer       0.48      0.21      0.29       100

      accuracy                           0.42       300
     macro avg       0.45      0.42      0.40       300
  weighted avg       0.45      0.42      0.40       300



BioBERT

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the 100-per-class subset written by the sampling cell
df = pd.read_csv('hundredSamples.csv')

# Candidate labels each text is scored against
candidate_labels = ["colon cancer", "thyroid cancer", "lung cancer"]

# NOTE(review): 'dmis-lab/biobert-v1.1' ships without a trained classification
# head; the loader reports 'classifier.weight' / 'classifier.bias' as newly
# initialized and the pipeline cannot find an 'entailment' label. The scores
# below therefore come from random weights. Seeding torch before loading at
# least makes the run reproducible; it does not make the predictions meaningful.
torch.manual_seed(42)
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-v1.1")

# Move the model to the appropriate device
model.to(device)

# Zero-shot classification pipeline (device=0 is the first GPU, -1 the CPU)
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Classify the whole 'Text' column in a single pipeline call instead of
# invoking the pipeline once per row.
results = classifier(df['Text'].tolist(), candidate_labels)
if isinstance(results, dict):
    # A lone sequence may come back as a bare dict rather than a list
    results = [results]

# Labels are returned sorted by descending score, so index 0 is the prediction
predicted_labels = [result['labels'][0] for result in results]
confidence_scores = [result['scores'][0] for result in results]

# Add the results to the DataFrame
df['predicted_label'] = predicted_labels
df['confidence_score'] = confidence_scores

# Save the updated DataFrame to a new CSV file
df.to_csv('BioBERT_hundredSamples.csv', index=False)

# Display the updated DataFrame
print("First few rows of the updated DataFrame with predictions:")
print(df.head())


Device: cuda


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


First few rows of the updated DataFrame with predictions:
   No           Class                                               Text  \
0   0  Thyroid_Cancer  Thyroid surgery in  children in a single insti...   
1   1  Thyroid_Cancer  " The adopted strategy was the same as that us...   
2   2  Thyroid_Cancer  coronary arterybypass grafting thrombosis ï¬b...   
3   3  Thyroid_Cancer   Solitary plasmacytoma SP of the skull is an u...   
4   4  Thyroid_Cancer   This study aimed to investigate serum matrix ...   

  predicted_label  confidence_score  
0    colon cancer          0.425390  
1     lung cancer          0.345232  
2    colon cancer          0.336900  
3    colon cancer          0.435938  
4    colon cancer          0.439657  


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Reload the BioBERT predictions and bring the gold labels into the same
# lower-case, space-separated form as the candidate labels
df_biobert = (
    pd.read_csv('BioBERT_hundredSamples.csv')
    .assign(Class=lambda frame: frame['Class'].str.lower().str.replace('_', ' '))
)

# Overall accuracy, then per-class metrics
true_labels = df_biobert['Class']
accuracy = accuracy_score(true_labels, df_biobert['predicted_label'])
print("Accuracy:", accuracy)
report = classification_report(true_labels, df_biobert['predicted_label'])
print("Classification Report:", report, sep="\n")

Accuracy: 0.25666666666666665
Classification Report:
                precision    recall  f1-score   support

  colon cancer       0.27      0.49      0.35       100
   lung cancer       0.37      0.18      0.24       100
thyroid cancer       0.15      0.10      0.12       100

      accuracy                           0.26       300
     macro avg       0.26      0.26      0.24       300
  weighted avg       0.26      0.26      0.24       300



pritamdeka/BioBert-PubMed200kRCT - truncation error

medicalai/ClinicalBERT - truncation error

BlueBERT - truncation error