In [2]:
import pandas as pd
# Step 1: Load and preprocess the dataset
# Load the dataset
df = pd.read_csv('Dataset_1/it_jobs.csv')

# Drop rows with missing values in relevant columns
df = df.dropna(subset=['review_text', 'sentiment'])

# Map sentiment to binary labels
label_map = {"Neutral": 2, "Positive": 1, "Negative": 0}  # Modify as per dataset
df['label'] = df['sentiment'].map(label_map)

# Balance the dataset by sampling 5000 samples per class
df_sampled = (
    df.groupby('label').sample(n=5000, random_state=42).reset_index(drop=True)
)
# Verify the resulting counts
print(df_sampled["label"].value_counts())


ValueError: need at least one array to concatenate

In [2]:
# Step 2: Preprocess the text minimally (no stopword removal or lemmatization)
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()    # Remove extra whitespace
    return text

df_sampled['cleaned_review'] = df_sampled['review_text'].apply(preprocess_text)

In [3]:
# Step 3: Tokenize the text

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the cleaned reviews
tokenized_reviews = tokenizer(
    list(df_sampled['cleaned_review']),
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

# Create tensors for input IDs, attention masks, and labels
input_ids = tokenized_reviews['input_ids']
attention_mask = tokenized_reviews['attention_mask']
labels = torch.tensor(df_sampled['label'].values, dtype=torch.long)

In [4]:
# Step 4: Split the data into training, validation, and test sets

from sklearn.model_selection import train_test_split
train_inputs, temp_inputs, train_masks, temp_masks, train_labels, temp_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.3, random_state=42
)
val_inputs, test_inputs, val_masks, test_masks, val_labels, test_labels = train_test_split(
    temp_inputs, temp_masks, temp_labels, test_size=0.5, random_state=42
)

In [5]:
# Step 5: Create DataLoader objects for training, validation, and test sets

from torch.utils.data import DataLoader, TensorDataset
batch_size = 32

def create_dataloader(inputs, masks, labels, batch_size, shuffle=False):
    data = TensorDataset(inputs, masks, labels)
    return DataLoader(data, batch_size=batch_size, shuffle=shuffle, num_workers=4)

train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, batch_size, shuffle=True)
val_dataloader = create_dataloader(val_inputs, val_masks, val_labels, batch_size)
test_dataloader = create_dataloader(test_inputs, test_masks, test_labels, batch_size)

In [6]:
# Step 6a: Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [7]:
# Step 6b: Load the pre-trained model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
# Step 7: Define optimizer, scheduler, and loss function
from torch import nn, optim
from transformers import get_scheduler

optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_function = nn.CrossEntropyLoss()
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

In [9]:
# Step 8a: Define training functions
def train(model, dataloader, optimizer, scheduler, loss_function, device):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for batch in dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = [t.to(device) for t in batch]

        optimizer.zero_grad()

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct_predictions += (preds == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples * 100
    return avg_loss, accuracy

In [10]:
# Step 8b: Define evaluation functions
def evaluate(model, dataloader, loss_function, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = [t.to(device) for t in batch]

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct_predictions += (preds == batch_labels).sum().item()
            total_samples += batch_labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples * 100
    return avg_loss, accuracy

In [11]:
# Step 9: Training loop
num_epochs = 3

for epoch in range(num_epochs):
    print(f"\nStarting Epoch {epoch + 1}...")

    train_loss, train_accuracy = train(model, train_dataloader, optimizer, scheduler, loss_function, device)
    print(f"Epoch {epoch + 1} - Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%")
    val_loss, val_accuracy = evaluate(model, val_dataloader, loss_function, device)
    print(f"Epoch {epoch + 1} - Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")


Starting Epoch 1...
Epoch 1 - Training Loss: 0.5758, Training Accuracy: 69.29%
Epoch 1 - Validation Loss: 0.4842, Validation Accuracy: 77.05%

Starting Epoch 2...
Epoch 2 - Training Loss: 0.4480, Training Accuracy: 79.22%
Epoch 2 - Validation Loss: 0.4666, Validation Accuracy: 78.29%

Starting Epoch 3...
Epoch 3 - Training Loss: 0.4084, Training Accuracy: 82.00%
Epoch 3 - Validation Loss: 0.4691, Validation Accuracy: 78.38%


In [12]:
# Step 10: Evaluate on test data
test_loss, test_accuracy = evaluate(model, test_dataloader, loss_function, device)
print(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")



Test Loss: 0.4710, Test Accuracy: 78.48%
