In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

# Initialize the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
df = pd.read_csv('/kaggle/input/finalized-robertadataset/FinalizedSkinDiseaseDataset.csv', on_bad_lines="skip")
df.columns = df.columns.str.strip()
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

df.columns = ['Disease name', 'Text']  # Rename columns
df['Disease name'] = df['Disease name'].str.strip('"')
df['Text'] = df['Text'].str.strip('"')
df['Disease name'] = df['Disease name'].astype('category')

# Label mapping
label_map = {label: i for i, label in enumerate(df['Disease name'].unique())}
df['Disease name'] = df['Disease name'].map(label_map)

# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Text'], df['Disease name'], test_size=0.2, random_state=42
)

# Use RoBERTa Model
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=256)

# Compute class weights
class_weights = compute_class_weight(
    'balanced', classes=np.array(df['Disease name'].unique()), y=df['Disease name']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Dataset class
class SkinDiseaseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

# DataLoader
train_loader = DataLoader(SkinDiseaseDataset(train_encodings, train_labels), batch_size=16, shuffle=True, num_workers=4)
val_loader = DataLoader(SkinDiseaseDataset(val_encodings, val_labels), batch_size=16, num_workers=4)

# Load RoBERTa model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.01)
num_train_steps = len(train_loader) * 20
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*num_train_steps, num_training_steps=num_train_steps)

# Training function
def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss, correct_preds, total_preds = 0, 0, 0
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        correct_preds += (preds == batch['labels']).sum().item()
        total_preds += len(batch['labels'])
    return total_loss / len(data_loader), correct_preds / total_preds

# Evaluation function
def evaluate(model, data_loader, device):
    model.eval()
    total_preds, total_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=-1)
            total_preds.extend(preds.cpu().numpy())
            total_labels.extend(batch['labels'].cpu().numpy())
    return (
        accuracy_score(total_labels, total_preds),
        precision_score(total_labels, total_preds, average='weighted'),
        recall_score(total_labels, total_preds, average='weighted'),
        f1_score(total_labels, total_preds, average='weighted')
    )

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Training loop with early stopping
best_val_accuracy, patience, epochs_without_improvement = 0, 3, 0
epochs = 50

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    val_accuracy, precision, recall, f1 = evaluate(model, val_loader, device)
    print(f"Validation Accuracy: {val_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print('-' * 50)
    
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        epochs_without_improvement = 0
        model.save_pretrained('./roberta_skin_disease_model')
        tokenizer.save_pretrained('./roberta_skin_disease_model')
    else:
        epochs_without_improvement += 1
    
    if epochs_without_improvement >= patience:
        print("Early stopping triggered")
        break

# Final save
model.save_pretrained('./roberta_skin_disease_model')
tokenizer.save_pretrained('./roberta_skin_disease_model')
print("Fine-tuning completed and RoBERTa model saved.")


Epoch 1/50
Train Loss: 0.1712, Train Accuracy: 0.9748
Validation Accuracy: 0.8994, Precision: 0.9134, Recall: 0.8994, F1-Score: 0.9015
--------------------------------------------------
Epoch 2/50
Train Loss: 0.1556, Train Accuracy: 0.9764
Validation Accuracy: 0.8868, Precision: 0.9221, Recall: 0.8868, F1-Score: 0.8876
--------------------------------------------------
Epoch 3/50
Train Loss: 0.1394, Train Accuracy: 0.9858
Validation Accuracy: 0.8868, Precision: 0.9037, Recall: 0.8868, F1-Score: 0.8849
--------------------------------------------------
Epoch 4/50
Train Loss: 0.1300, Train Accuracy: 0.9843
Validation Accuracy: 0.8931, Precision: 0.8976, Recall: 0.8931, F1-Score: 0.8928
--------------------------------------------------
Early stopping triggered
Fine-tuning completed and RoBERTa model saved.


In [5]:
%cd /kaggle/working

/kaggle/working


In [6]:
import shutil

# Create a zip file of the model directory
shutil.make_archive('/kaggle/working/roberta_skin_disease_model', 'zip', '/kaggle/working', 'roberta_skin_disease_model')

'/kaggle/working/roberta_skin_disease_model.zip'

In [7]:
from IPython.display import FileLink

FileLink('roberta_skin_disease_model.zip')