In [3]:
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
import torch
import os

/kaggle/input/imdb-dataset

In [4]:
# Load the labeled training dataset from a CSV file
train_dataset_path = '/kaggle/input/imdb-dataset/train.csv'
df_train = pd.read_csv(train_dataset_path)

# Load the labeled test dataset from a CSV file
test_dataset_path = '/kaggle/input/imdb-dataset/test.csv'
df_test = pd.read_csv(test_dataset_path)

# Split the training dataset into training and validation sets
train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)

In [5]:
# Define a custom dataset class
class SentimentDataset(Dataset):
    def __init__(self, review, sentiment, tokenizer, max_length=128):
        self.review = review
        self.sentiment = sentiment
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.review)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.review[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # Convert string labels to integers
        label_mapping = {'positive': 1.0, 'negative': 0.0}
        labels = torch.tensor(label_mapping[self.sentiment[idx]])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [6]:
print(df_train.columns)

Index(['review', 'sentiment'], dtype='object')


In [7]:
print(df_test.columns)


Index(['review', 'sentiment'], dtype='object')


In [8]:
# Define the model, tokenizer, and optimizer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
optimizer = AdamW(model.parameters(), lr=1e-5)

# Prepare the datasets
train_dataset = SentimentDataset(train_df['review'].tolist(), train_df['sentiment'].tolist(), tokenizer)
val_dataset = SentimentDataset(val_df['review'].tolist(), val_df['sentiment'].tolist(), tokenizer)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
# Training loop
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

loss_fn = BCEWithLogitsLoss()

for epoch in range(epochs):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []

    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Sigmoid activation for binary classification
        logits = torch.sigmoid(outputs.logits)
        predictions.extend(logits.cpu().detach().numpy())
        true_labels.extend(labels.cpu().detach().numpy())

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    accuracy = accuracy_score(true_labels, [1 if p > 0.5 else 0 for p in predictions])
    print(f"Epoch {epoch + 1}/{epochs} - Average Training Loss: {average_loss}, Accuracy: {accuracy}")

    # Validation
    model.eval()
    val_loss = 0
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items()}

            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            labels = inputs['labels']

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Sigmoid activation for binary classification
            logits = torch.sigmoid(outputs.logits)
            val_predictions.extend(logits.cpu().detach().numpy())
            val_true_labels.extend(labels.cpu().detach().numpy())

    average_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_true_labels, [1 if p > 0.5 else 0 for p in val_predictions])
    print(f"Epoch {epoch + 1}/{epochs} - Average Validation Loss: {average_val_loss}, Accuracy: {val_accuracy}")

Epoch 1/5 - Average Training Loss: 0.11796562910825015, Accuracy: 0.5685833333333333
Epoch 1/5 - Average Validation Loss: 0.0996247685427467, Accuracy: 0.7436666666666667
Epoch 2/5 - Average Training Loss: 0.07919863963748018, Accuracy: 0.605375
Epoch 2/5 - Average Validation Loss: 0.09730752332198123, Accuracy: 0.5513333333333333
Epoch 3/5 - Average Training Loss: 0.052975510714575645, Accuracy: 0.6324583333333333
Epoch 3/5 - Average Validation Loss: 0.09687836362731954, Accuracy: 0.7773333333333333
Epoch 4/5 - Average Training Loss: 0.03510834017007922, Accuracy: 0.6620416666666666
Epoch 4/5 - Average Validation Loss: 0.10258476559096016, Accuracy: 0.6913333333333334
Epoch 5/5 - Average Training Loss: 0.02466474009906718, Accuracy: 0.6735
Epoch 5/5 - Average Validation Loss: 0.10369347152711512, Accuracy: 0.679


In [10]:
# Prepare the test dataset
test_dataset = SentimentDataset(df_test['review'].tolist(), df_test['sentiment'].tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [11]:
# Testing loop
model.eval()
test_predictions = []
test_true_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = torch.sigmoid(outputs.logits)
        test_predictions.extend(logits.cpu().detach().numpy())
        test_true_labels.extend(labels.cpu().detach().numpy())

# Calculate accuracy for the test dataset
test_accuracy = accuracy_score(test_true_labels, [1 if p > 0.5 else 0 for p in test_predictions])
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.6807


In [12]:
# Save the fine-tuned model
model.save_pretrained('fine_tuned_distilbert-base-uncased_model')
tokenizer.save_pretrained('fine_tuned_distilbert-base-uncased_model')

('fine_tuned_distilbert-base-uncased_model/tokenizer_config.json',
 'fine_tuned_distilbert-base-uncased_model/special_tokens_map.json',
 'fine_tuned_distilbert-base-uncased_model/vocab.txt',
 'fine_tuned_distilbert-base-uncased_model/added_tokens.json',
 'fine_tuned_distilbert-base-uncased_model/tokenizer.json')

In [13]:
model_path = '/kaggle/working/Distillbert_finetuning'
torch.save(model.state_dict(), model_path)