# 1. Environment Setup & Imports

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import optuna
from torch.utils.tensorboard import SummaryWriter

In [7]:
# Download NLTK resources
nltk.download(['stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
# Use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


# 2. Data Loading & Preprocessing

In [9]:
# Load and prepare data
df = pd.read_csv('datasets/all_it_jobs.csv')
columns_to_keep = ['review_text', 'sentiment']
df = df[columns_to_keep].dropna(subset=['review_text'])
# Balance dataset with replacement
df_sampled = (df.groupby("sentiment")
              .sample(n=20000, random_state=42, replace=True)
              .reset_index(drop=True))
df_sampled.head(5)
# Text preprocessing setup
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(tag):
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }.get(tag[0], wordnet.NOUN)


def preprocess_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'http\S+|www\S+|[^a-zA-Z\s]', '', text.lower())
    tokens = [lemmatizer.lemmatize(w, get_wordnet_pos(tag))
              for w, tag in pos_tag(nltk.word_tokenize(text))
              if w not in stop_words]
    return ' '.join(tokens)


# Apply preprocessing
df_sampled['cleaned_review'] = df_sampled['review_text'].apply(preprocess_text)
df_sampled = df_sampled[df_sampled['cleaned_review'].str.strip().astype(bool)]


# 3. Data Preparation & Splitting

In [10]:
# Split the data
train_df, temp_df = train_test_split(
    df_sampled, test_size=0.3, stratify=df_sampled['sentiment'], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['sentiment'], random_state=42
)
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['cleaned_review'])


def prepare_sequences(df):
    sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
    lengths = [min(len(seq), 250) for seq in sequences]
    padded = pad_sequences(sequences, maxlen=250, padding='post', truncating='post')
    return padded, lengths


X_train, train_lengths = prepare_sequences(train_df)
X_val, val_lengths = prepare_sequences(val_df)
X_test, test_lengths = prepare_sequences(test_df)
# Label encoding
le = LabelEncoder()
y_train = le.fit_transform(train_df['sentiment'])
y_val = le.transform(val_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

# 4. Embedding Layer Preparation

In [11]:
def load_glove_embeddings(path, tokenizer, embed_dim):
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings_index[word] = vector

    vocab_size = tokenizer.num_words + 1
    embedding_matrix = np.zeros((vocab_size, embed_dim))

    for word, i in tokenizer.word_index.items():
        if i >= vocab_size: continue
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]

    return torch.tensor(embedding_matrix, dtype=torch.float32)


embedding_matrix = load_glove_embeddings('glove.6B.100d.txt', tokenizer, 100)


# 5. PyTorch Dataset & DataLoader

In [12]:
def create_dataloader(X, lengths, y, batch_size=128, shuffle=False):
    dataset = TensorDataset(
        torch.tensor(X, dtype=torch.long),
        torch.tensor(lengths, dtype=torch.long),
        torch.tensor(y, dtype=torch.long)
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=4)


train_loader = create_dataloader(X_train, train_lengths, y_train, shuffle=True)
val_loader = create_dataloader(X_val, val_lengths, y_val)
test_loader = create_dataloader(X_test, test_lengths, y_test)

# 6. LSTM Model

In [13]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,
                 num_layers, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False) \
            if pretrained_embeddings is not None \
            else nn.Embedding(vocab_size, embed_dim)

        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # Forward pass through LSTM
        _, (hidden, _) = self.lstm(packed)
        
        # Split the hidden state into forward and backward directions
        num_layers = self.lstm.num_layers
        # First num_layers entries are forward, next num_layers are backward
        hidden_forward = hidden[:num_layers, :, :]
        hidden_backward = hidden[num_layers:, :, :]
        
        # Extract the last hidden states from both directions
        last_hidden_forward = hidden_forward[-1, :, :]  # (batch_size, hidden_dim)
        last_hidden_backward = hidden_backward[-1, :, :]  # (batch_size, hidden_dim)
        
        # Concatenate the final forward and backward hidden states
        combined = torch.cat((last_hidden_forward, last_hidden_backward), dim=1)
        
        return self.fc(self.dropout(combined))

# 7. Training & Evaluation Functions

In [14]:
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0

    for X, lengths, y in loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()

        outputs = model(X, lengths)
        loss = criterion(outputs, y)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == y).sum().item()

    return total_loss / len(loader), correct / len(loader.dataset)


def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for X, lengths, y in loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X, lengths)
            loss = criterion(outputs, y)

            total_loss += loss.item()
            correct += (outputs.argmax(1) == y).sum().item()

    return total_loss / len(loader), correct / len(loader.dataset)

# 8. Hyperparameter Optimization with Optuna

In [17]:
def objective(trial):
    writer = SummaryWriter()
    
    params = {
        'hidden_dim': trial.suggest_categorical('hidden_dim', [128, 256]),  # Adjusted
        'num_layers': trial.suggest_int('num_layers', 1, 2),  # Reduced max layers
        'dropout': trial.suggest_float('dropout', 0.2, 0.6)  # Higher dropout
                  if (num_layers := trial.suggest_int('num_layers', 1, 2)) > 1
                  else 0.0,
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),  # Tighter LR range
        'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True),
    }

    model = BiLSTM(
        vocab_size=10001,
        embed_dim=100,
        output_dim=len(le.classes_),
        pretrained_embeddings=embedding_matrix,
        hidden_dim=params['hidden_dim'],
        num_layers=params['num_layers'],
        dropout=params['dropout']
    ).to(device)

    optimizer = optim.AdamW(  # Better optimizer
        model.parameters(),
        lr=params['lr'],
        weight_decay=params['weight_decay']
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    criterion = nn.CrossEntropyLoss()
    best_val_acc = 0
    patience = 2
    no_improvement = 0

    for epoch in range(8):  # Slightly increased epochs
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        scheduler.step()

        # Report intermediate results for pruning
        trial.report(val_acc, epoch)

        # Early stopping and pruning
        if trial.should_prune():
            raise optuna.TrialPruned()

        # Log metrics
        writer.add_scalars('Loss', {'train': train_loss, 'val': val_loss}, epoch)
        writer.add_scalars('Accuracy', {'train': train_acc, 'val': val_acc}, epoch)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            no_improvement = 0
        else:
            no_improvement += 1
            if no_improvement >= patience:
                break  # Early exit from unpromising trials

    return best_val_acc


# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2025-01-29 14:34:40,783] A new study created in memory with name: no-name-5e017af8-a1a2-4e61-a824-6bc60c249e3b
[I 2025-01-29 14:39:09,986] Trial 0 finished with value: 0.6188888888888889 and parameters: {'hidden_dim': 256, 'num_layers': 2, 'dropout': 0.43990518765075687, 'lr': 2.8961622775096578e-05, 'weight_decay': 5.587093958780898e-05}. Best is trial 0 with value: 0.6188888888888889.
[I 2025-01-29 14:41:59,376] Trial 1 finished with value: 0.6108888888888889 and parameters: {'hidden_dim': 128, 'num_layers': 1, 'lr': 2.9763195717810546e-05, 'weight_decay': 0.0001793600528400798}. Best is trial 0 with value: 0.6188888888888889.
[I 2025-01-29 14:46:03,699] Trial 2 finished with value: 0.6284444444444445 and parameters: {'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.27804327781772736, 'lr': 6.248812529954186e-05, 'weight_decay': 3.0910510189933214e-06}. Best is trial 2 with value: 0.6284444444444445.
[I 2025-01-29 14:50:21,708] Trial 3 finished with value: 0.6606666666666666 and p

# 9. Final Model Training

In [19]:
best_params = study.best_params
final_model = BiLSTM(
    vocab_size=10001,
    embed_dim=100,
    output_dim=len(le.classes_),
    pretrained_embeddings=embedding_matrix,
    hidden_dim=best_params['hidden_dim'],
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout']
).to(device)
# Train final model
optimizer = optim.Adam(final_model.parameters(), lr=best_params['lr'])
criterion = nn.CrossEntropyLoss()

for epoch in range(6):
    train_loss, train_acc = train_epoch(final_model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(final_model, val_loader, criterion, device)
    print(
        f'Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}')

Epoch 1: Train Loss: 0.8693 Acc: 0.5802 | Val Loss: 0.7903 Acc: 0.6270
Epoch 2: Train Loss: 0.7825 Acc: 0.6323 | Val Loss: 0.7690 Acc: 0.6400
Epoch 3: Train Loss: 0.7497 Acc: 0.6537 | Val Loss: 0.7513 Acc: 0.6507
Epoch 4: Train Loss: 0.7234 Acc: 0.6679 | Val Loss: 0.7679 Acc: 0.6418
Epoch 5: Train Loss: 0.6994 Acc: 0.6824 | Val Loss: 0.7617 Acc: 0.6451
Epoch 6: Train Loss: 0.6806 Acc: 0.6959 | Val Loss: 0.7572 Acc: 0.6589


# 10. Final Evaluation

In [20]:
test_loss, test_acc = evaluate(final_model, test_loader, criterion, device)
print(f'\nFinal Test Performance: Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f}')


Final Test Performance: Loss: 0.7615 | Accuracy: 0.6493


# 11. Save the best parameters

In [21]:
import json

with open('params/best_bi_lstm_params.json', 'w') as f:
    json.dump(best_params, f)