In [1]:
import os
import sys

# Directory the kernel was started in; the relative ".." below is resolved against it.
cwd = os.getcwd()

# Make the parent directory importable (presumably where the `data` and `src`
# packages imported by later cells live -- confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from data.data_preprocessing import load_and_preprocess
import tensorflow as tf

# Transformer Sentiment Classification

This notebook implements a Transformer architecture from scratch for sentiment analysis on Amazon product reviews.

## 1. Load and Preprocess Data

In [4]:
# Fetch the Amazon review splits, asking for a 100k-row training sample and a
# 20k-row validation sample; the test split size is left to the loader.
train, val, test = load_and_preprocess(
    train_sample_size=100_000,
    val_sample_size=20_000,
)

In [5]:
# Show the leading rows of the training split to inspect its columns.
print("Dataset structure:")
train.head(n=5)

Dataset structure:


Unnamed: 0,label,title,text
0,1,Einstein: His Life Revealed,walter isaacson has clearly covered the bases ...
1,0,Totally unreliable measuring directions,the examples of things that can be built are v...
2,1,It's Not Just About the Book,i just finished reading the book as the tour d...
3,0,Playing in the dark,be careful buying this light the invoice and o...
4,0,"Rose-flavored Water, NOT Rosewater",if all you're after is rose flavor for cooking...


In [6]:
# Row counts for every split, printed in train / validation / test order.
for split_name, split_rows in (("Training", train), ("Validation", val), ("Test", test)):
    print(f"{split_name} size: {len(split_rows)}")

# Class balance of the training split (one count per distinct label value).
print("\nTraining label distribution:")
label_counts = train['label'].value_counts()
print(label_counts)

Training size: 100000
Validation size: 20000
Test size: 180000

Training label distribution:
label
0    50133
1    49867
Name: count, dtype: int64


## 2. Import Transformer Model

In [None]:
# Import our custom Transformer implementation
from src.Simple_transformer import TransformerSentimentClassifier


## 3. Prepare Data for Transformer

In [8]:
# Pull the review bodies and their labels out of each split as plain Python lists.
train_texts, train_labels = train['text'].tolist(), train['label'].tolist()
val_texts, val_labels = val['text'].tolist(), val['label'].tolist()
test_texts, test_labels = test['text'].tolist(), test['label'].tolist()

# Sanity check: preview the first training example (text truncated to 200 chars).
first_text, first_label = train_texts[0], train_labels[0]
print(f"Sample text: {first_text[:200]}...")
print(f"Sample label: {first_label}")

Sample text: walter isaacson has clearly covered the bases in researching one of the most fascinating and brilliant human beings ever born but i often got lost in the "nitty gritty" details that at times seemed tr...
Sample label: 1


## 4. Create and Configure Transformer Model

In [9]:
# --- Hyperparameters (tune here; later cells read these constants) ---
VOCAB_SIZE = 10000      # vocabulary size handed to the model and the vocabulary builder
MAX_LENGTH = 128        # sequence length used when encoding texts
D_MODEL = 128           # model (embedding) dimension
NUM_HEADS = 8           # attention heads per transformer layer
NUM_LAYERS = 4          # number of stacked transformer layers
DFF = 512               # feed-forward dimension
DROPOUT_RATE = 0.1      # forwarded to the model as `rate`
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001

# --- Model construction ---
model_config = dict(
    vocab_size=VOCAB_SIZE,
    max_length=MAX_LENGTH,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dff=DFF,
    rate=DROPOUT_RATE,
    num_classes=1,  # single output unit: binary classification
)
model = TransformerSentimentClassifier(**model_config)

# Echo the architectural settings so the run is self-describing.
print("Model created with:")
for setting_name, setting_value in (
    ("Vocabulary size", VOCAB_SIZE),
    ("Max sequence length", MAX_LENGTH),
    ("Model dimension", D_MODEL),
    ("Number of attention heads", NUM_HEADS),
    ("Number of transformer layers", NUM_LAYERS),
    ("Feed-forward dimension", DFF),
):
    print(f"- {setting_name}: {setting_value}")

Model created with:
- Vocabulary size: 10000
- Max sequence length: 128
- Model dimension: 128
- Number of attention heads: 8
- Number of transformer layers: 4
- Feed-forward dimension: 512


## 5. Build Vocabulary

In [10]:
# Fit the vocabulary on the training texts only.
print("Building vocabulary...")
model.build_vocabulary(train_texts, vocab_size=VOCAB_SIZE, max_length=MAX_LENGTH)

# Report the size of the resulting word -> index mapping and its first 20 entries.
word_index = model.word_to_idx
leading_words = list(word_index.keys())[:20]
print("\nVocabulary statistics:")
print(f"Total vocabulary size: {len(word_index)}")
print(f"Sample words: {leading_words}")

Building vocabulary...
Built vocabulary with 10000 words

Vocabulary statistics:
Total vocabulary size: 10000
Sample words: ['<PAD>', '<UNK>', '<START>', '<END>', 'the', 'and', 'i', 'a', 'to', 'it', 'of', 'this', 'is', 'in', 'for', 'that', 'was', 'you', 'not', 'with']


## 6. Encode Texts

In [11]:
# Turn every split's raw texts into token-index sequences of length MAX_LENGTH.
print("Encoding texts...")
X_train, X_val, X_test = (
    model.encode_texts(split_texts, max_length=MAX_LENGTH)
    for split_texts in (train_texts, val_texts, test_texts)
)

# Labels become float32 arrays, one per split.
y_train, y_val, y_test = (
    np.array(split_labels, dtype=np.float32)
    for split_labels in (train_labels, val_labels, test_labels)
)

print("Data shapes:")
for split_tag, features, targets in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"X_{split_tag}: {features.shape}, y_{split_tag}: {targets.shape}")

# Side-by-side look at one raw text and the start of its encoding.
first_text, first_encoded = train_texts[0], X_train[0]
print(f"\nOriginal text: {first_text[:100]}...")
print(f"Encoded (first 20 tokens): {first_encoded[:20]}")

Encoding texts...
Data shapes:
X_train: (100000, 128), y_train: (100000,)
X_val: (20000, 128), y_val: (20000,)
X_test: (180000, 128), y_test: (180000,)

Original text: walter isaacson has clearly covered the bases in researching one of the most fascinating and brillia...
Encoded (first 20 tokens): [6446    1   45  824 1338    4 7727   13 4296   27   10    4  118 1498
    5 1324  853 5199  146 2016]


## 7. Compile Model

In [13]:
# Attach optimizer, loss and metric. Binary cross-entropy pairs with the single
# output unit requested at construction time.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Build against a (batch, MAX_LENGTH) input shape -- presumably so the weights
# exist (and can be counted) before any data is fed through.
model.build(input_shape=(None, MAX_LENGTH))

# Human-readable outline of the architecture, driven by the constants above.
architecture_lines = [
    "\n Model Architecture:",
    f"Input: (batch_size, {MAX_LENGTH}) - Token sequences",
    f"Embedding: {VOCAB_SIZE} -> {D_MODEL}",
    f"Positional Encoding: {MAX_LENGTH} positions",
    f"Transformer Blocks: {NUM_LAYERS} layers",
    f"  - Multi-Head Attention: {NUM_HEADS} heads",
    f"  - Feed-Forward: {D_MODEL} -> {DFF} -> {D_MODEL}",
    "Global Average Pooling: Sequence aggregation",
    f"Classification Head: {D_MODEL} -> 256 -> 128 -> 1",
    "Output: Binary sentiment prediction",
]
print("\n".join(architecture_lines))



 Model Architecture:
Input: (batch_size, 128) - Token sequences
Embedding: 10000 -> 128
Positional Encoding: 128 positions
Transformer Blocks: 4 layers
  - Multi-Head Attention: 8 heads
  - Feed-Forward: 128 -> 512 -> 128
Global Average Pooling: Sequence aggregation
Classification Head: 128 -> 256 -> 128 -> 1
Output: Binary sentiment prediction


## 8. Create TensorFlow Datasets

In [14]:
# Input pipelines. Only the training pipeline is shuffled (10,000-element
# buffer); validation and test keep their original order, which the evaluation
# cell relies on when it compares predictions against y_test.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

print("Datasets created successfully!")
# Ask each pipeline for its cardinality rather than doing len(list(dataset)):
# the latter iterates the whole dataset and holds every batch in memory just
# to count them. Cardinality is statically known for in-memory slices that are
# only shuffled, batched and prefetched.
print(f"Training batches: {train_dataset.cardinality().numpy()}")
print(f"Validation batches: {val_dataset.cardinality().numpy()}")
print(f"Test batches: {test_dataset.cardinality().numpy()}")

Datasets created successfully!


2025-06-10 09:30:26.858417: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-06-10 09:30:26.911337: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Training batches: 3125
Validation batches: 625
Test batches: 5625


## 9. Setup Training Callbacks

In [15]:
# Halt training when validation accuracy has not improved for 3 epochs and
# restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 2 epochs without validation-loss improvement,
# never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)

callbacks = [early_stopping, reduce_lr]

print(
    "Training callbacks configured:\n"
    "- Early stopping on validation accuracy (patience=3)\n"
    "- Learning rate reduction on plateau (factor=0.5, patience=2)"
)

Training callbacks configured:
- Early stopping on validation accuracy (patience=3)
- Learning rate reduction on plateau (factor=0.5, patience=2)


## 10. Train the Transformer Model

In [None]:
# Echo the run configuration so the training log is self-describing.
print(" Starting Transformer Training...")
print("=" * 50)
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Max sequence length: {MAX_LENGTH}")
print(f"Epochs: {EPOCHS}")

# Train the model; `history` feeds the plotting cell further down.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

print("\n Training completed!")

# Count parameters after training. This is best-effort reporting, so a failure
# must not abort the notebook -- but catch only Exception (a bare `except:`
# would also swallow KeyboardInterrupt/SystemExit) and say why the count is
# missing instead of hiding the error.
try:
    total_params = model.count_params()
except Exception as exc:
    print("Model successfully trained!")
    print(f"(parameter count unavailable: {exc!r})")
else:
    print(f"Total parameters: {total_params:,}")
    print("Note: Transformer uses self-attention for parallel processing")

## 11. Visualize Training History

In [None]:
def plot_transformer_training_history(history):
    """Draw loss and accuracy curves (train vs. validation) side by side.

    Args:
        history: object whose ``history`` attribute maps the keys ``'loss'``,
            ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'`` to
            per-epoch value sequences (e.g. the return value of ``model.fit``).

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # One row per panel: (train key, validation key, metric name, train label, val label).
    panels = (
        ('loss', 'val_loss', 'Loss', 'Train Loss', 'Val Loss'),
        ('accuracy', 'val_accuracy', 'Accuracy', 'Train Acc', 'Val Acc'),
    )

    for ax, (train_key, val_key, metric_name, train_label, val_label) in zip(axes, panels):
        ax.plot(history.history[train_key], label=train_label, color='blue')
        ax.plot(history.history[val_key], label=val_label, color='red')
        ax.set_title(f'Transformer Sentiment Classifier - {metric_name}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_name)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

# Render the loss/accuracy curves recorded during training.
plot_transformer_training_history(history=history)

## 12. Evaluate Model Performance

In [None]:
def evaluate_transformer_model(model, test_dataset, y_test):
    """Score the model on the test set and print a per-class breakdown.

    Args:
        model: model exposing ``evaluate`` and ``predict``. ``evaluate`` must
            yield exactly two values, (loss, accuracy), which matches a model
            compiled with the single ``'accuracy'`` metric.
        test_dataset: batched dataset passed to both ``evaluate`` and
            ``predict``. Its sample order is assumed to match ``y_test``
            (i.e. it is not shuffled) -- verify if the pipeline changes.
        y_test: array of ground-truth labels; cast to int for the report.

    Returns:
        The test accuracy reported by ``model.evaluate``.
    """
    # Aggregate loss / accuracy over the whole test set.
    test_loss, test_accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Threshold the raw scores at 0.5 to obtain hard 0/1 class predictions.
    scores = model.predict(test_dataset, verbose=0)
    above_threshold = scores > 0.5
    predicted_classes = above_threshold.astype(int).flatten()
    true_classes = y_test.astype(int)

    # Imported here rather than at module level, as in the original cell.
    from sklearn.metrics import classification_report, confusion_matrix

    class_names = ['Negative', 'Positive']
    print("\nClassification Report:")
    print(classification_report(true_classes, predicted_classes, target_names=class_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_classes, predicted_classes))

    return test_accuracy

# Run the held-out evaluation and keep the resulting accuracy for later use.
test_accuracy = evaluate_transformer_model(
    model=model,
    test_dataset=test_dataset,
    y_test=y_test,
)

## 13. Test Model with Sample Texts

In [None]:
def analyze_transformer_predictions(model, texts, max_length=128):
    """Print the model's sentiment verdict for each sample text.

    All texts are encoded and scored in a single batched ``model.predict``
    call instead of one call per text, which avoids repeated per-call
    overhead while printing the same report.

    Args:
        model: object exposing ``encode_texts(list_of_str, max_length=...)``
            and ``predict(encoded, verbose=0)``. ``predict`` must return one
            row per input text, with the score at index 0 of each row.
        texts: iterable of raw text strings. An empty iterable prints only
            the header and never touches the model.
        max_length: sequence length forwarded to ``model.encode_texts``.

    Returns:
        None. Results are written to stdout.
    """
    print("\n TESTING TRANSFORMER WITH SAMPLE TEXTS")
    print("=" * 50)

    # Materialize once so generators work and the input can be both encoded
    # and iterated for display.
    texts = list(texts)
    if not texts:
        return

    # Encode and score the whole batch in one go.
    encoded = model.encode_texts(texts, max_length=max_length)
    predictions = model.predict(encoded, verbose=0)

    for sample_number, (text, prediction_row) in enumerate(zip(texts, predictions), start=1):
        score = prediction_row[0]

        # Scores above 0.5 are read as positive; confidence is the distance
        # from the opposite class, so it always lies in [0.5, 1].
        is_positive = score > 0.5
        sentiment = "Positive" if is_positive else "Negative"
        confidence = score if is_positive else 1 - score

        print(f"\nSample {sample_number}:")
        print(f"Text: '{text[:100]}{'...' if len(text) > 100 else ''}'")
        print(f"Prediction: {sentiment} (confidence: {confidence:.3f})")
        print(f"Raw score: {score:.4f}")

# Hand-written reviews covering clearly positive, clearly negative and
# lukewarm wording, used as a quick qualitative check of the trained model.
sample_texts = [
    "This product is absolutely fantastic! I love everything about it.",
    "Terrible quality. Waste of money. Would not recommend.",
    "Pretty good overall, but could be better for the price.",
    "Amazing customer service and fast delivery. Very satisfied!",
    "The product broke after just one day. Very disappointed.",
    "Okay product, nothing special but does the job.",
]

analyze_transformer_predictions(model, sample_texts, max_length=MAX_LENGTH)