# 1. Environment Setup & Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import optuna
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Download NLTK resources
nltk.download(['stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


# 2. Data Loading & Preprocessing

In [4]:
# Load and prepare data
df = pd.read_csv('datasets/all_it_jobs.csv')
columns_to_keep = ['review_text', 'sentiment']
df = df[columns_to_keep].dropna(subset=['review_text'])

In [5]:
# Balance dataset with replacement
df_sampled = (df.groupby("sentiment")
              .sample(n=20000, random_state=42, replace=True)
              .reset_index(drop=True))

In [7]:
df_sampled.head(5)

Unnamed: 0,review_text,sentiment
0,depends on teams avoid proddev bonuses are goo...,Negative
1,do not expect more than 15 of hike while joini...,Negative
2,disrespectful and ancient generally nice peopl...,Negative
3,if you don’t want to mail in the next 30 years...,Negative
4,work somewhere else that rewards innovation an...,Negative


In [9]:
# Text preprocessing setup
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }.get(tag[0], wordnet.NOUN)

def preprocess_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'http\S+|www\S+|[^a-zA-Z\s]', '', text.lower())
    tokens = [lemmatizer.lemmatize(w, get_wordnet_pos(tag)) 
              for w, tag in pos_tag(nltk.word_tokenize(text)) 
              if w not in stop_words]
    return ' '.join(tokens)

In [10]:
# Apply preprocessing
df_sampled['cleaned_review'] = df_sampled['review_text'].apply(preprocess_text)
df_sampled = df_sampled[df_sampled['cleaned_review'].str.strip().astype(bool)]

# 3. Data Preparation & Splitting

In [11]:
# Split the data
train_df, temp_df = train_test_split(
    df_sampled, test_size=0.3, stratify=df_sampled['sentiment'], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['sentiment'], random_state=42
)

In [12]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['cleaned_review'])

def prepare_sequences(df):
    sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
    lengths = [min(len(seq), 250) for seq in sequences]
    padded = pad_sequences(sequences, maxlen=250, padding='post', truncating='post')
    return padded, lengths

X_train, train_lengths = prepare_sequences(train_df)
X_val, val_lengths = prepare_sequences(val_df)
X_test, test_lengths = prepare_sequences(test_df)

In [13]:
# Label encoding
le = LabelEncoder()
y_train = le.fit_transform(train_df['sentiment'])
y_val = le.transform(val_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

# 4. Embedding Layer Preparation

In [14]:
def load_glove_embeddings(path, tokenizer, embed_dim):
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings_index[word] = vector
    
    vocab_size = tokenizer.num_words + 1
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    
    for word, i in tokenizer.word_index.items():
        if i >= vocab_size: continue
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]
            
    return torch.tensor(embedding_matrix, dtype=torch.float32)

embedding_matrix = load_glove_embeddings('glove.6B.100d.txt', tokenizer, 100)

# 5. PyTorch Dataset & DataLoader

In [15]:
def create_dataloader(X, lengths, y, batch_size=128, shuffle=False):
    dataset = TensorDataset(
        torch.tensor(X, dtype=torch.long),
        torch.tensor(lengths, dtype=torch.long),
        torch.tensor(y, dtype=torch.long)
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=4)

train_loader = create_dataloader(X_train, train_lengths, y_train, shuffle=True)
val_loader = create_dataloader(X_val, val_lengths, y_val)
test_loader = create_dataloader(X_test, test_lengths, y_test)

# 6. LSTM Model

In [16]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, 
                 num_layers, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False) \
            if pretrained_embeddings is not None \
            else nn.Embedding(vocab_size, embed_dim)
        
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, 
                           dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        return self.fc(self.dropout(hidden[-1]))

# 7. Training & Evaluation Functions

In [18]:
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    
    for X, lengths, y in loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        
        outputs = model(X, lengths)
        loss = criterion(outputs, y)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        total_loss += loss.item()
        correct += (outputs.argmax(1) == y).sum().item()
        
    return total_loss/len(loader), correct/len(loader.dataset)

In [19]:
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    
    with torch.no_grad():
        for X, lengths, y in loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X, lengths)
            loss = criterion(outputs, y)
            
            total_loss += loss.item()
            correct += (outputs.argmax(1) == y).sum().item()
            
    return total_loss/len(loader), correct/len(loader.dataset)

# 8. Hyperparameter Optimization with Optuna

In [20]:
def objective(trial):
    writer = SummaryWriter()
    
    params = {
        'hidden_dim': trial.suggest_categorical('hidden_dim', [128, 256, 512]),
        'num_layers': trial.suggest_int('num_layers', 1, 3),
        # Conditional dropout - only suggest when num_layers > 1
        'dropout': trial.suggest_float('dropout', 0.1, 0.5) 
                   if trial.suggest_int('num_layers', 1, 3) > 1 
                   else 0.0,
        'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256])
    }
    
    model = SentimentLSTM(
        vocab_size=10001,
        embed_dim=100,
        output_dim=len(le.classes_),
        pretrained_embeddings=embedding_matrix,
        hidden_dim=params['hidden_dim'],
        num_layers=params['num_layers'],
        dropout=params['dropout']
    ).to(device)
    
    optimizer = optim.AdamW(  # Better optimizer
        model.parameters(), 
        lr=params['lr'],
        weight_decay=trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)  # Added
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    criterion = nn.CrossEntropyLoss()
    best_val_acc = 0
    patience = 2
    no_improvement = 0

    for epoch in range(15):  # Slightly increased epochs
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        scheduler.step()
        
        # Report intermediate results for pruning
        trial.report(val_acc, epoch)
        
        # Early stopping and pruning
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        # Log metrics
        writer.add_scalars('Loss', {'train': train_loss, 'val': val_loss}, epoch)
        writer.add_scalars('Accuracy', {'train': train_acc, 'val': val_acc}, epoch)
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            no_improvement = 0
        else:
            no_improvement += 1
            if no_improvement >= patience:
                break  # Early exit from unpromising trials

    return best_val_acc

In [21]:
# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2025-01-28 20:34:09,381] A new study created in memory with name: no-name-fd4bdd79-a73d-4b73-aba4-131330fcc7e3
[I 2025-01-28 20:40:10,728] Trial 0 finished with value: 0.645 and parameters: {'hidden_dim': 512, 'num_layers': 2, 'dropout': 0.27456906318386587, 'lr': 0.0001542605632479653}. Best is trial 0 with value: 0.645.
[I 2025-01-28 20:43:29,512] Trial 1 finished with value: 0.6553333333333333 and parameters: {'hidden_dim': 512, 'num_layers': 1, 'dropout': 0.4995289347837444, 'lr': 0.001801517854059461}. Best is trial 1 with value: 0.6553333333333333.
[I 2025-01-28 20:46:46,223] Trial 2 finished with value: 0.6492222222222223 and parameters: {'hidden_dim': 512, 'num_layers': 1, 'dropout': 0.3820320663755761, 'lr': 0.0009542166913596821}. Best is trial 1 with value: 0.6553333333333333.
[I 2025-01-28 20:50:29,815] Trial 3 finished with value: 0.653 and parameters: {'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.3937889137309931, 'lr': 0.00039699537864364376}. Best is trial 1 with

# 9. Final Model Training

In [26]:
best_params = study.best_params
final_model = SentimentLSTM(
    vocab_size=10001,
    embed_dim=100,
    output_dim=len(le.classes_),
    pretrained_embeddings=embedding_matrix,
    hidden_dim=best_params['hidden_dim'],
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout']
).to(device)



In [27]:
# Train final model
optimizer = optim.Adam(final_model.parameters(), lr=best_params['lr'])
criterion = nn.CrossEntropyLoss()

for epoch in range(5):
    train_loss, train_acc = train_epoch(final_model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(final_model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}')

Epoch 1: Train Loss: 0.8719 Acc: 0.5816 | Val Loss: 0.7895 Acc: 0.6272
Epoch 2: Train Loss: 0.7542 Acc: 0.6523 | Val Loss: 0.7538 Acc: 0.6509
Epoch 3: Train Loss: 0.6935 Acc: 0.6895 | Val Loss: 0.7494 Acc: 0.6504
Epoch 4: Train Loss: 0.6338 Acc: 0.7212 | Val Loss: 0.7757 Acc: 0.6537
Epoch 5: Train Loss: 0.5708 Acc: 0.7551 | Val Loss: 0.8242 Acc: 0.6491


# 10. Final Evaluation

In [28]:
test_loss, test_acc = evaluate(final_model, test_loader, criterion, device)
print(f'\nFinal Test Performance: Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f}')


Final Test Performance: Loss: 0.8382 | Accuracy: 0.6419


# 11. Save the best parameters

In [31]:
import json

with open('params/best_bi_lstm_params.json', 'w') as f:
    json.dump(best_params, f)